mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* adding spans to doc_annotation in Example.to_dict * to_dict compatible with from_dict: tuples instead of spans * use strings for label and kb_id * Simplify test * Update data formats docs Co-authored-by: Stefanie Wolf <stefanie.wolf@vitecsoftware.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
		
			
				
	
	
		
			472 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			472 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| from spacy.training.example import Example
 | |
| from spacy.tokens import Doc
 | |
| from spacy.vocab import Vocab
 | |
| from spacy.util import to_ternary_int
 | |
| 
 | |
| 
 | |
def test_Example_init_requires_doc_objects():
    """Example() must raise TypeError when either argument is None."""
    vocab = Vocab()
    # Each pair has at least one side that is not a Doc.
    invalid_pairs = (
        (None, None),
        (Doc(vocab, words=["hi"]), None),
        (None, Doc(vocab, words=["hi"])),
    )
    for predicted, reference in invalid_pairs:
        with pytest.raises(TypeError):
            Example(predicted, reference)
 | |
| 
 | |
| 
 | |
def test_Example_from_dict_basic():
    """from_dict with only a "words" annotation exposes Doc objects as x and y."""
    predicted = Doc(Vocab(), words=["hello", "world"])
    annotations = {"words": ["hello", "world"]}
    example = Example.from_dict(predicted, annotations)
    for side in (example.x, example.y):
        assert isinstance(side, Doc)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}]
)
def test_Example_from_dict_invalid(annots):
    """from_dict is expected to raise KeyError for the "weirdannots" key."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(KeyError):
        Example.from_dict(predicted, annots)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]]
)
@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}])
def test_Example_from_dict_with_tags(pred_words, annots):
    """Gold tags land on the reference doc and align to every predicted token.

    The predicted tokenization varies per parameter set; in each case the
    aligned tag for every predicted token is expected to be "NN".
    """
    predicted = Doc(Vocab(), words=pred_words)
    example = Example.from_dict(predicted, annots)

    gold_tags = annots["tags"]
    for token in example.reference:
        assert token.tag_ == gold_tags[token.i]

    expected_alignment = ["NN"] * len(predicted)
    assert example.get_aligned("TAG", as_string=True) == expected_alignment
 | |
| 
 | |
| 
 | |
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_aligned_tags():
    """Aligned tags are identical before and after a to_dict() round trip."""
    predicted = Doc(
        Vocab(),
        words=["Apply", "some", "sunscreen", "unless", "you", "can", "not"],
    )
    gold = {
        "words": ["Apply", "some", "sun", "screen", "unless", "you", "cannot"],
        "tags": ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"],
    }
    expected = ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]

    direct = Example.from_dict(predicted, gold)
    assert direct.get_aligned("TAG", as_string=True) == expected

    # Rebuilding the example from its own to_dict() output must give the
    # same alignment.
    rebuilt = Example.from_dict(predicted, direct.to_dict())
    assert rebuilt.get_aligned("TAG", as_string=True) == expected
 | |
| 
 | |
| 
 | |
def test_aligned_tags_multi():
    """Check aligned tags when predicted and gold tokens overlap unevenly.

    The first two predicted tokens are expected to get no aligned tag (None).
    """
    predicted = Doc(
        Vocab(),
        words=["Applysome", "sunscreen", "unless", "you", "can", "not"],
    )
    gold = {
        "words": ["Apply", "somesun", "screen", "unless", "you", "cannot"],
        "tags": ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"],
    }
    example = Example.from_dict(predicted, gold)
    expected = [None, None, "SCONJ", "PRON", "VERB", "VERB"]
    assert example.get_aligned("TAG", as_string=True) == expected
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "London", "and", "Berlin", "."],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
            "heads": [1, 1, 1, 2, 2, 1],
        }
    ],
)
def test_Example_from_dict_with_parse(annots):
    """Dependency labels and head indices are copied onto the reference doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    gold_deps = annots["deps"]
    gold_heads = annots["heads"]
    for token in reference:
        assert token.dep_ == gold_deps[token.i]
        assert token.head.i == gold_heads[token.i]
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["Sarah", "'s", "sister", "flew"],
            "morphs": [
                "NounType=prop|Number=sing",
                "Poss=yes",
                "Number=sing",
                "Tense=past|VerbForm=fin",
            ],
        }
    ],
)
def test_Example_from_dict_with_morphology(annots):
    """Each reference token's morph stringifies to the supplied "morphs" entry."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    gold_morphs = annots["morphs"]
    for token in reference:
        assert str(token.morph) == gold_morphs[token.i]
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "one", "sentence", "this", "is", "another"],
            "sent_starts": [1, False, 0, None, True, -1, -5.7],
        }
    ],
)
def test_Example_from_dict_with_sent_start(annots):
    """Mixed-type sent_starts values map onto Token.is_sent_start.

    Each raw value is normalised with to_ternary_int: a result of 1 must
    give True, 0 must give None, and anything else must give False.
    """
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.sents)) == 2

    # Ternary value -> expected is_sent_start; values not listed mean False.
    expected_flag = {1: True, 0: None}
    for token in example.reference:
        ternary = to_ternary_int(annots["sent_starts"][token.i])
        assert token.is_sent_start is expected_flag.get(ternary, False)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "a", "sentence"],
            "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},
        }
    ],
)
def test_Example_from_dict_with_cats(annots):
    """Category scores from the dict are stored on reference.cats."""
    predicted = Doc(Vocab(), words=annots["words"])
    cats = Example.from_dict(predicted, annots).reference.cats
    assert len(list(cats)) == 3
    for label, score in (("cat1", 1.0), ("cat2", 0.0), ("cat3", 0.5)):
        assert cats[label] == score
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
        }
    ],
)
def test_Example_from_dict_with_entities(annots):
    """Character-offset entities become IOB-tagged ents on the reference doc."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    reference = example.reference

    assert len(list(reference.ents)) == 2
    # fmt: off
    iob_strings = [reference[idx].ent_iob_ for idx in range(7)]
    assert iob_strings == ["O", "O", "B", "I", "O", "B", "O"]
    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]
    # fmt: on
    # Tokens 2-3 ("New York") and 5 ("Berlin") carry the entity label.
    for idx in (2, 3, 5):
        assert reference[idx].ent_type_ == "LOC"
 | |
| 
 | |
| 
 | |
def test_Example_from_dict_with_empty_entities():
    """An empty entity list marks every token "O"; None or no key leaves NER unset."""
    annots = {
        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
        "entities": [],
    }
    predicted = Doc(Vocab(), words=annots["words"])

    # Empty list: annotation is present and every token is outside an entity.
    reference = Example.from_dict(predicted, annots).reference
    assert reference.has_annotation("ENT_IOB")
    assert len(list(reference.ents)) == 0
    assert all(token.ent_iob_ == "O" for token in reference)

    # Explicit None: entity annotation stays unset.
    annots["entities"] = None
    reference = Example.from_dict(predicted, annots).reference
    assert not reference.has_annotation("ENT_IOB")

    # Key removed entirely: entity annotation stays unset as well.
    del annots["entities"]
    reference = Example.from_dict(predicted, annots).reference
    assert not reference.has_annotation("ENT_IOB")
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [
                (0, 4, "LOC"),
                (21, 27, "LOC"),
            ],  # not aligned to token boundaries
        }
    ],
)
def test_Example_from_dict_with_entities_invalid(annots):
    """Misaligned entity offsets emit a UserWarning and produce no ents."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.warns(UserWarning):
        example = Example.from_dict(predicted, annots)
    assert not list(example.reference.ents)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [
                (7, 15, "LOC"),
                (11, 15, "LOC"),
                (20, 26, "LOC"),
            ],  # overlapping
        }
    ],
)
def test_Example_from_dict_with_entities_overlapping(annots):
    """Overlapping entity offsets make from_dict raise ValueError."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "spans": {
                "cities": [(7, 15, "LOC"), (20, 26, "LOC")],
                "people": [(0, 1, "PERSON")],
            },
        }
    ],
)
def test_Example_from_dict_with_spans(annots):
    """Span groups populate reference.spans and leave reference.ents empty."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    assert len(list(reference.ents)) == 0

    # Group key -> (expected span count, expected label for every span).
    expectations = {"cities": (2, "LOC"), "people": (1, "PERSON")}
    for key, (count, _) in expectations.items():
        assert len(list(reference.spans[key])) == count
    for key, (_, label) in expectations.items():
        for span in reference.spans[key]:
            assert span.label_ == label
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "spans": {
                "cities": [(7, 15, "LOC"), (11, 15, "LOC"), (20, 26, "LOC")],
                "people": [(0, 1, "PERSON")],
            },
        }
    ],
)
def test_Example_from_dict_with_spans_overlapping(annots):
    """Overlapping spans within one span group are all kept."""
    predicted = Doc(Vocab(), words=annots["words"])
    reference = Example.from_dict(predicted, annots).reference
    assert len(list(reference.ents)) == 0

    # Group key -> (expected span count, expected label for every span).
    expectations = {"cities": (3, "LOC"), "people": (1, "PERSON")}
    for key, (count, _) in expectations.items():
        assert len(list(reference.spans[key])) == count
    for key, (_, label) in expectations.items():
        for span in reference.spans[key]:
            assert span.label_ == label
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "spans": [(0, 1, "PERSON")],
        },
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "spans": {"cities": (7, 15, "LOC")},
        },
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "spans": {"cities": [7, 11]},
        },
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "spans": {"cities": [[7]]},
        },
    ],
)
def test_Example_from_dict_with_spans_invalid(annots):
    """Each malformed "spans" value makes from_dict raise ValueError."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
            "links": {
                (7, 15): {"Q60": 1.0, "Q64": 0.0},
                (20, 26): {"Q60": 0.0, "Q64": 1.0},
            },
        }
    ],
)
def test_Example_from_dict_with_links(annots):
    """The KB id scored 1.0 for each linked span is set on its tokens."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    # One expected ent_kb_id_ per token; "" for tokens outside linked spans.
    expected_kb_ids = ["", "", "Q60", "Q60", "", "Q64", ""]
    for idx, kb_id in enumerate(expected_kb_ids):
        assert example.reference[idx].ent_kb_id_ == kb_id
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}},
        }
    ],
)
def test_Example_from_dict_with_links_invalid(annots):
    """The (7, 14) link offsets are expected to make from_dict raise ValueError."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
 | |
| 
 | |
| 
 | |
def test_Example_from_dict_sentences():
    """The number of reference sentences follows the sent_starts annotation."""
    vocab = Vocab()

    # Two tokens are marked as sentence starts, so two sentences result.
    two_sents = Example.from_dict(
        Doc(vocab, words=["One", "sentence", ".", "one", "more"]),
        {"sent_starts": [1, 0, 0, 1, 0]},
    )
    assert len(list(two_sents.reference.sents)) == 2

    # this currently throws an error - bug or feature?
    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
    # ex = Example.from_dict(predicted, annots)
    # assert len(list(ex.reference.sents)) == 1

    # A single start plus one explicit -1 yields exactly one sentence.
    one_sent = Example.from_dict(
        Doc(vocab, words=["One", "sentence", "not", "one", "more"]),
        {"sent_starts": [1, -1, 0, 0, 0]},
    )
    assert len(list(one_sent.reference.sents)) == 1
 | |
| 
 | |
| 
 | |
def test_Example_missing_deps():
    """Heads are only honoured when dependency labels are supplied too."""
    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 2, 2, 1]
    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]
    predicted = Doc(Vocab(), words=words)

    def reference_heads(annotations):
        """Return the head index of every reference token."""
        example = Example.from_dict(predicted, annotations)
        return [token.head.i for token in example.reference]

    # when not providing deps, the head information is considered to be missing
    # in this case, the token's heads refer to themselves
    heads_only = {"words": words, "heads": heads}
    assert reference_heads(heads_only) == list(range(6))

    # when providing deps, the head information is actually used
    heads_and_deps = {"words": words, "heads": heads, "deps": deps}
    assert reference_heads(heads_and_deps) == heads
 | |
| 
 | |
| 
 | |
def test_Example_missing_heads():
    """A token with head and dep set to None is reported as having no head."""
    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, None, 2, 1]
    deps = ["nsubj", "ROOT", "dobj", None, "conj", "punct"]
    predicted = Doc(Vocab(), words=words)

    example = Example.from_dict(
        predicted, {"words": words, "heads": heads, "deps": deps}
    )
    reference = example.reference

    # Index 3 is skipped: its head is missing in the gold annotation.
    parsed_heads = [token.head.i for token in reference]
    for idx in (0, 1, 2, 4, 5):
        assert parsed_heads[idx] == heads[idx]

    has_head = [token.has_head() for token in reference]
    assert has_head == [True, True, True, False, True, True]

    # Ensure that the missing head doesn't create an artificial new sentence start
    sent_starts = example.get_aligned_sent_starts()
    assert sent_starts == [True, False, False, False, False, False]
 | |
| 
 | |
| 
 | |
def test_Example_aligned_whitespace(en_vocab):
    """Aligned tags include the tag of a whitespace-only token."""
    tokens = ["a", " ", "b"]
    gold_tags = ["A", "SPACE", "B"]
    example = Example(
        Doc(en_vocab, words=tokens),
        Doc(en_vocab, words=tokens, tags=gold_tags),
    )
    assert example.get_aligned("TAG", as_string=True) == gold_tags
 | |
| 
 | |
| 
 | |
@pytest.mark.issue("11260")
def test_issue11260():
    """Spans must survive an Example.to_dict() / Example.from_dict() round trip.

    Builds an example with two span groups, serialises it with to_dict(),
    checks that the serialised spans equal the input tuples, rebuilds an
    example from that dict, and verifies the span contents on both the
    original and the rebuilt example.
    """
    annots = {
        "words": ["I", "like", "New", "York", "."],
        "spans": {
            "cities": [(7, 15, "LOC", "")],
            "people": [(0, 1, "PERSON", "")],
        },
    }
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(example.reference.spans["cities"]) == 1
    assert len(example.reference.spans["people"]) == 1

    # to_dict() has to emit spans in the same tuple form from_dict() accepts.
    output_dict = example.to_dict()
    assert "spans" in output_dict["doc_annotation"]
    assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"]
    assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"]

    output_example = Example.from_dict(predicted, output_dict)

    assert len(output_example.reference.spans["cities"]) == len(
        example.reference.spans["cities"]
    )
    assert len(output_example.reference.spans["people"]) == len(
        example.reference.spans["people"]
    )

    # Check span contents on the rebuilt example as well as the original, so
    # a round trip that keeps the span count but corrupts label or offsets
    # is caught.
    for checked in (example, output_example):
        for span in checked.reference.spans["cities"]:
            assert span.label_ == "LOC"
            assert span.text == "New York"
            assert span.start_char == 7
        for span in checked.reference.spans["people"]:
            assert span.label_ == "PERSON"
            assert span.text == "I"
            assert span.start_char == 0
 |