mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-12 15:25:47 +03:00
fix spaces
This commit is contained in:
parent
6fea5fa4bd
commit
1c35b8efcd
|
@ -55,8 +55,9 @@ cdef class Example:
|
||||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||||
if "ORTH" not in tok_dict:
|
if "ORTH" not in tok_dict:
|
||||||
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
||||||
if "SPACY" not in tok_dict:
|
|
||||||
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
||||||
|
if "SPACY" not in tok_dict:
|
||||||
|
tok_dict["SPACY"] = None
|
||||||
return Example(
|
return Example(
|
||||||
predicted,
|
predicted,
|
||||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||||
|
|
|
@ -166,7 +166,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
spaces = [True, True, True, True, True, False, False]
|
spaces = [True, True, True, True, True, False, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||||
gold_words =["I", "flew to", "San Francisco Valley", "."]
|
gold_words = ["I", "flew to", "San Francisco Valley", "."]
|
||||||
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
|
||||||
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
|
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 3, 1, 1, 2]
|
||||||
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
|
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "LOC", "LOC", "LOC", ""]
|
||||||
|
@ -188,12 +188,11 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
|
||||||
)
|
)
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||||
gp = GoldParse(
|
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
|
||||||
doc,
|
gold_spaces = [True, True, False, True, False, False]
|
||||||
words=["I", "flew", " ", "to", "San Francisco Valley", "."],
|
example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
|
||||||
entities=entities,
|
assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
|
||||||
)
|
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""]
|
||||||
assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
|
|
||||||
|
|
||||||
# from issue #4791
|
# from issue #4791
|
||||||
data = (
|
data = (
|
||||||
|
|
Loading…
Reference in New Issue
Block a user