From 1c35b8efcdeee8b701077c57b80c1ae0efcda749 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 16 Jun 2020 12:08:25 +0200 Subject: [PATCH] fix spaces --- spacy/gold/example.pyx | 3 ++- spacy/tests/test_gold.py | 13 ++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index ee81e0481..adae9335b 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -55,8 +55,9 @@ cdef class Example: tok_dict, doc_dict = _parse_example_dict_data(example_dict) if "ORTH" not in tok_dict: tok_dict["ORTH"] = [tok.text for tok in predicted] - if "SPACY" not in tok_dict: tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] + if "SPACY" not in tok_dict: + tok_dict["SPACY"] = None return Example( predicted, annotations2doc(predicted.vocab, tok_dict, doc_dict) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index f7d7d70bb..5f92a476c 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -166,7 +166,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gold_words =["I", "flew to", "San Francisco Valley", "."] + gold_words = ["I", "flew to", "San Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2] assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""] @@ -188,12 +188,11 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): ) doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gp = GoldParse( - doc, - words=["I", "flew", " ", "to", "San Francisco Valley", "."], - entities=entities, - ) - assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] + gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."] + gold_spaces = [True, True, False, True, False, False] + example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}) + assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2] + assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""] # from issue #4791 data = (