fix spaces

This commit is contained in:
svlandeg 2020-06-16 12:08:25 +02:00
parent 6fea5fa4bd
commit 1c35b8efcd
2 changed files with 8 additions and 8 deletions

View File

@ -55,8 +55,9 @@ cdef class Example:
tok_dict, doc_dict = _parse_example_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if "ORTH" not in tok_dict: if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["ORTH"] = [tok.text for tok in predicted]
if "SPACY" not in tok_dict:
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
if "SPACY" not in tok_dict:
tok_dict["SPACY"] = None
return Example( return Example(
predicted, predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict) annotations2doc(predicted.vocab, tok_dict, doc_dict)

View File

@ -166,7 +166,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
spaces = [True, True, True, True, True, False, False] spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gold_words =["I", "flew to", "San Francisco Valley", "."] gold_words = ["I", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2] assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""] assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
@ -188,12 +188,11 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
) )
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gp = GoldParse( gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
doc, gold_spaces = [True, True, False, True, False, False]
words=["I", "flew", " ", "to", "San Francisco Valley", "."], example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
entities=entities, assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
) assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""]
assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
# from issue #4791 # from issue #4791
data = ( data = (