attempt to fix _guess_spaces

This commit is contained in:
svlandeg 2020-06-26 11:38:37 +02:00
parent 5b1d15e247
commit 5b6ed05752
2 changed files with 2 additions and 3 deletions

View File

@@ -73,7 +73,7 @@ cdef class Example:
tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["ORTH"] = [tok.text for tok in predicted]
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
if not _has_field(tok_dict, "SPACY"): if not _has_field(tok_dict, "SPACY"):
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"]) tok_dict["SPACY"] = _guess_spaces(predicted.text, tok_dict["ORTH"])
return Example( return Example(
predicted, predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict) annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -333,8 +333,6 @@ def _fix_legacy_dict_data(example_dict):
else: else:
raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
text = example_dict.get("text", example_dict.get("raw")) text = example_dict.get("text", example_dict.get("raw"))
if not _has_field(token_dict, "SPACY"):
token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
if "HEAD" in token_dict and "SENT_START" in token_dict: if "HEAD" in token_dict and "SENT_START" in token_dict:
# If heads are set, we don't also redundantly specify SENT_START. # If heads are set, we don't also redundantly specify SENT_START.
token_dict.pop("SENT_START") token_dict.pop("SENT_START")

View File

@@ -161,6 +161,7 @@ def test_example_from_dict_no_ner(en_vocab):
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == [None, None, None, None] assert ner_tags == [None, None, None, None]
def test_example_from_dict_some_ner(en_vocab): def test_example_from_dict_some_ner(en_vocab):
words = ["a", "b", "c", "d"] words = ["a", "b", "c", "d"]
spaces = [True, True, False, True] spaces = [True, True, False, True]