Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-31 07:57:35 +03:00

Few more Example unit tests (#5720)

* small fixes in Example, UX
* add gold tests for aligned_spans and get_aligned_parse
* sentencizer unnecessary

This commit is contained in:
parent 433dc3c9c9
commit a39a110c4e
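
In short: an Example pairs a predicted Doc with a reference Doc over the same text, keeping an alignment between their (possibly different) tokenizations, and the new tests pin down how spans and parses map between the two sides. A minimal sketch of the y2x direction, distilled from the tests added below (the spacy.gold import paths match this development branch; later releases moved them to spacy.training):

    from spacy.gold.example import Example
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    # Predicted side: coarse tokens, one for the whole name and one for the place
    doc = Doc(Vocab(), words=["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."],
              spaces=[True, True, True, False, False])
    # Reference side: one token per word, entities given as character offsets
    example = Example.from_dict(doc, {
        "words": ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."],
        "entities": [(0, 16, "PERSON"), (25, 45, "LOC")],
    })
    # Reference entities cover token ranges (0, 4) and (6, 9); projected onto the
    # coarser predicted tokenization they collapse to (0, 1) and (3, 4)
    spans = example.get_aligned_spans_y2x(example.reference.ents)
    print([(s.start, s.end) for s in spans])  # [(0, 1), (3, 4)]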
					
spacy/errors.py
@@ -477,15 +477,14 @@ class Errors(object):
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
     # TODO: fix numbering after merging develop into master
+    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
             "array and {doc_length} for the Doc itself.")
     E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
     E973 = ("Unexpected type for NER data")
     E974 = ("Unknown {obj} attribute: {key}")
-    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
-            "but got {type}")
-    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
+    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
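
The E975/E976 pair collapses into one parameterized E976 template. For reference, a quick sketch of the two messages it now produces (plain Python, not part of the diff):

    E976 = "The method 'Example.from_dict' expects a {type} as {n} argument, but received None."
    print(E976.format(n="first", type="Doc"))
    # The method 'Example.from_dict' expects a Doc as first argument, but received None.
    print(E976.format(n="second", type="dict"))
    # The method 'Example.from_dict' expects a dict as second argument, but received None.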
spacy/gold/example.pyx
@@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
 
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
-        """ Doc can either be text, or an actual Doc """
         if predicted is None:
             raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
@@ -59,17 +58,15 @@ cdef class Example:
 
     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
+        if predicted is None:
+            raise ValueError(Errors.E976.format(n="first", type="Doc"))
         if example_dict is None:
-            raise ValueError(Errors.E976)
-        if not isinstance(predicted, Doc):
-            raise TypeError(Errors.E975.format(type=type(predicted)))
+            raise ValueError(Errors.E976.format(n="second", type="dict"))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
-        if not _has_field(tok_dict, "SPACY"):
-            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
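
For callers, both argument checks in from_dict now fail fast with a ValueError built from that single template. A hedged usage sketch in the pytest style of the tests below:

    import pytest
    from spacy.gold.example import Example
    from spacy.lang.en import English

    doc = English()("Mr and Mrs Smith flew to San Francisco Valley")
    with pytest.raises(ValueError):
        Example.from_dict(None, {"words": ["Smith"]})  # first argument must be a Doc
    with pytest.raises(ValueError):
        Example.from_dict(doc, None)                   # second argument must be a dict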
@@ -257,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([vocab.morphology.add(v) for v in value])
         else:
             attrs.append(key)
-            values.append([vocab.strings.add(v) for v in value])
+            try:
+                values.append([vocab.strings.add(v) for v in value])
+            except TypeError:
+                types = set([type(v) for v in value])
+                raise TypeError(Errors.E969.format(field=key, types=types))
 
     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
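
The new try/except turns an opaque StringStore failure into E969, naming the field and the offending value types. A sketch of the behavior this buys, assuming tag values that are ints rather than strings (the exact field routing is an assumption based on the code above):

    import pytest
    from spacy.gold.example import Example
    from spacy.lang.en import English

    doc = English()("flew to Valley")
    # Tag values must be strings; ints should now surface as E969, which names
    # the field ("TAG") and the offending types ({<class 'int'>})
    with pytest.raises(TypeError):
        Example.from_dict(doc, {"words": ["flew", "to", "Valley"], "tags": [1, 2, 3]})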
spacy/tests/parser/test_nonproj.py
@@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
 
 def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
     assert contains_cycle(tree) is None
-    assert contains_cycle(cyclic_tree) == set([3, 4, 5])
+    assert contains_cycle(cyclic_tree) == {3, 4, 5}
     assert contains_cycle(partial_tree) is None
     assert contains_cycle(multirooted_tree) is None
spacy/tests/test_gold.py
@@ -5,6 +5,7 @@ from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.gold.converters import json2docs
 from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
@@ -272,72 +273,72 @@ def test_split_sentences(en_vocab):
 
 
 def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
-    words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
 
     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
 
     entities = [
-        (len("Mr. and "), len("Mr. and Mrs."), "PERSON"),  # "Mrs." is a Person
+        (len("Mr and "), len("Mr and Mrs"), "PERSON"),  # "Mrs" is a Person
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", None, "O", "U-LOC", "O"]
 
 
 def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
-    words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 
     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 
 
 def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
-    words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."]
+    words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
+    gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
 
     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
+    gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
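
For readers new to the BILUO scheme used in these asserts: B/I/L mark the beginning, inside, and last tokens of a multi-token entity, U marks a single-token entity, O a token outside any entity, and None a token whose alignment is too ambiguous to label. A small illustration with the offset-to-BILUO helper (importable from spacy.gold on this branch):

    from spacy.gold import biluo_tags_from_offsets
    from spacy.lang.en import English

    doc = English()("Mr and Mrs Smith flew to San Francisco Valley")
    # Characters 25-45 cover "San Francisco Valley", i.e. tokens 6-8
    tags = biluo_tags_from_offsets(doc, [(25, 45, "LOC")])
    print(tags)  # ['O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']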
@@ -407,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
     assert spans[1].label_ == "GPE"
 
 
+def test_aligned_spans_y2x(en_vocab, en_tokenizer):
+    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
+    spaces = [True, True, True, False, False]
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    prefix = "Mr and Mrs Smith flew to "
+    entities = [
+        (0, len("Mr and Mrs Smith"), "PERSON"),
+        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
+    ]
+    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
+    ents_ref = example.reference.ents
+    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
+    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
+    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
+
+
+def test_aligned_spans_x2y(en_vocab, en_tokenizer):
+    text = "Mr and Mrs Smith flew to San Francisco Valley"
+    nlp = English()
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
+                {"label": "LOC", "pattern": "San Francisco Valley"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+    doc = nlp(text)
+    assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
+    prefix = "Mr and Mrs Smith flew to "
+    entities = [
+        (0, len("Mr and Mrs Smith"), "PERSON"),
+        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
+    ]
+    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
+    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
+    assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
+
+    # Ensure that 'get_aligned_spans_x2y' maps the predicted entities correctly
+    ents_pred = example.predicted.ents
+    assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
+    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
+    assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
+
+
 def test_gold_ner_missing_tags(en_tokenizer):
     doc = en_tokenizer("I flew to Silicon Valley via London.")
     biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@@ -414,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
     assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
 
 
+def test_projectivize(en_tokenizer):
+    doc = en_tokenizer("He pretty quickly walks away")
+    heads = [3, 2, 3, 0, 2]
+    example = Example.from_dict(doc, {"heads": heads})
+    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
+    nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
+    assert proj_heads == [3, 2, 3, 0, 3]
+    assert nonproj_heads == [3, 2, 3, 0, 2]
+
+
 def test_iob_to_biluo():
     good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
     good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
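
Context for the new test_projectivize test: the gold parse is non-projective because the arc from "quickly" (2) to "away" (4) crosses the arc between "He" (0) and "walks" (3). Projectivization re-attaches "away" one level higher, to "walks", which is exactly the [3, 2, 3, 0, 3] the test expects. A plain-Python sketch of the re-attachment:

    words = ["He", "pretty", "quickly", "walks", "away"]
    heads = [3, 2, 3, 0, 2]       # away -> quickly crosses the He <-> walks arc
    proj_heads = [3, 2, 3, 0, 3]  # away re-attached to walks; no crossing arcs
    for w, h, p in zip(words, heads, proj_heads):
        note = "  <- lifted" if h != p else ""
        print(f"{w:8} head={words[h]:8} projectivized={words[p]}{note}")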