Few more Example unit tests (#5720)

* small fixes in Example, UX

* add gold tests for aligned_spans and get_aligned_parse

* sentencizer unnecessary
This commit is contained in:
Sofie Van Landeghem 2020-07-07 18:46:00 +02:00 committed by GitHub
parent 433dc3c9c9
commit a39a110c4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 82 additions and 28 deletions

View File

@ -477,15 +477,14 @@ class Errors(object):
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
"array and {doc_length} for the Doc itself.") "array and {doc_length} for the Doc itself.")
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
E973 = ("Unexpected type for NER data") E973 = ("Unexpected type for NER data")
E974 = ("Unknown {obj} attribute: {key}") E974 = ("Unknown {obj} attribute: {key}")
E975 = ("The method 'Example.from_dict' expects a Doc as first argument, " E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
"but got {type}")
E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
"but received None.") "but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. " E977 = ("Can not compare a MorphAnalysis with a string object. "
"This is likely a bug in spaCy, so feel free to open an issue.") "This is likely a bug in spaCy, so feel free to open an issue.")

View File

@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
cdef class Example: cdef class Example:
def __init__(self, Doc predicted, Doc reference, *, alignment=None): def __init__(self, Doc predicted, Doc reference, *, alignment=None):
""" Doc can either be text, or an actual Doc """
if predicted is None: if predicted is None:
raise TypeError(Errors.E972.format(arg="predicted")) raise TypeError(Errors.E972.format(arg="predicted"))
if reference is None: if reference is None:
@ -59,17 +58,15 @@ cdef class Example:
@classmethod @classmethod
def from_dict(cls, Doc predicted, dict example_dict): def from_dict(cls, Doc predicted, dict example_dict):
if predicted is None:
raise ValueError(Errors.E976.format(n="first", type="Doc"))
if example_dict is None: if example_dict is None:
raise ValueError(Errors.E976) raise ValueError(Errors.E976.format(n="second", type="dict"))
if not isinstance(predicted, Doc):
raise TypeError(Errors.E975.format(type=type(predicted)))
example_dict = _fix_legacy_dict_data(example_dict) example_dict = _fix_legacy_dict_data(example_dict)
tok_dict, doc_dict = _parse_example_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if "ORTH" not in tok_dict: if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["ORTH"] = [tok.text for tok in predicted]
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
if not _has_field(tok_dict, "SPACY"):
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
return Example( return Example(
predicted, predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict) annotations2doc(predicted.vocab, tok_dict, doc_dict)
@ -257,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([vocab.morphology.add(v) for v in value]) values.append([vocab.morphology.add(v) for v in value])
else: else:
attrs.append(key) attrs.append(key)
try:
values.append([vocab.strings.add(v) for v in value]) values.append([vocab.strings.add(v) for v in value])
except TypeError:
types= set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types))
array = numpy.asarray(values, dtype="uint64") array = numpy.asarray(values, dtype="uint64")
return attrs, array.T return attrs, array.T

View File

@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree): def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
assert contains_cycle(tree) is None assert contains_cycle(tree) is None
assert contains_cycle(cyclic_tree) == set([3, 4, 5]) assert contains_cycle(cyclic_tree) == {3, 4, 5}
assert contains_cycle(partial_tree) is None assert contains_cycle(partial_tree) is None
assert contains_cycle(multirooted_tree) is None assert contains_cycle(multirooted_tree) is None

View File

@ -5,6 +5,7 @@ from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example from spacy.gold.example import Example
from spacy.gold.converters import json2docs from spacy.gold.converters import json2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, DocBin from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding from thinc.api import compounding
@ -272,72 +273,72 @@ def test_split_sentences(en_vocab):
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer): def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."] words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False] spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to " prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."] gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "U-LOC", "O"] assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"] assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs."), "PERSON"), # "Mrs." is a Person (len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", None, "O", "U-LOC", "O"] assert ner_tags == ["O", None, "O", "U-LOC", "O"]
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer): def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
spaces = [True, True, True, True, True, True, True, False, False] spaces = [True, True, True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to " prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."] gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."] gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_misaligned(en_vocab, en_tokenizer): def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."] words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, True, True, False, False] spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces) doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to " prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."] gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
entities = [ entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"), (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
] ]
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."] gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner() ner_tags = example.get_aligned_ner()
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
@ -407,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
assert spans[1].label_ == "GPE" assert spans[1].label_ == "GPE"
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr and Mrs Smith flew to "
entities = [
(0, len("Mr and Mrs Smith"), "PERSON"),
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
ents_ref = example.reference.ents
assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
ents_y2x = example.get_aligned_spans_y2x(ents_ref)
assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
text = "Mr and Mrs Smith flew to San Francisco Valley"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
{"label": "LOC", "pattern": "San Francisco Valley"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
prefix = "Mr and Mrs Smith flew to "
entities = [
(0, len("Mr and Mrs Smith"), "PERSON"),
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
# Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
ents_pred = example.predicted.ents
assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
ents_x2y = example.get_aligned_spans_x2y(ents_pred)
assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
def test_gold_ner_missing_tags(en_tokenizer): def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.") doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@ -414,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
def test_projectivize(en_tokenizer):
doc = en_tokenizer("He pretty quickly walks away")
heads = [3, 2, 3, 0, 2]
example = Example.from_dict(doc, {"heads": heads})
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
assert proj_heads == [3, 2, 3, 0, 3]
assert nonproj_heads == [3, 2, 3, 0, 2]
def test_iob_to_biluo(): def test_iob_to_biluo():
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]