Few more Example unit tests (#5720)

* small fixes in Example, UX
* add gold tests for aligned_spans and get_aligned_parse
* sentencizer unnecessary
parent 433dc3c9c9
commit a39a110c4e
@@ -477,15 +477,14 @@ class Errors(object):
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
             "array and {doc_length} for the Doc itself.")
     E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
     E973 = ("Unexpected type for NER data")
     E974 = ("Unknown {obj} attribute: {key}")
-    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
-            "but got {type}")
-    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
+    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
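The reworked E976 template now covers both argument checks in Example.from_dict. A minimal sketch of how it renders (the message text is taken verbatim from the diff above; assumes a build of this branch):

    from spacy.errors import Errors

    # Renders: "The method 'Example.from_dict' expects a Doc as first
    # argument, but received None."
    msg = Errors.E976.format(n="first", type="Doc")
    print(msg)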
@@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):

 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
-        """ Doc can either be text, or an actual Doc """
         if predicted is None:
             raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
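The removed docstring was stale: both arguments must be actual Docs, never text. A minimal sketch of the guard's behaviour, assuming a build of this branch:

    import pytest
    from spacy.gold.example import Example

    # E972: "Example.__init__ got None for 'predicted'. Requires Doc."
    with pytest.raises(TypeError):
        Example(None, None)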
@@ -59,17 +58,15 @@ cdef class Example:

     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
+        if predicted is None:
+            raise ValueError(Errors.E976.format(n="first", type="Doc"))
         if example_dict is None:
-            raise ValueError(Errors.E976)
+            raise ValueError(Errors.E976.format(n="second", type="dict"))
-        if not isinstance(predicted, Doc):
-            raise TypeError(Errors.E975.format(type=type(predicted)))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
-        if not _has_field(tok_dict, "SPACY"):
-            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
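Both guards now raise the same parametrized ValueError, and the dead _guess_spaces branch (its result was never used) is gone. A minimal sketch of the two error paths, assuming a build of this branch:

    import pytest
    from spacy.gold.example import Example
    from spacy.lang.en import English

    doc = English()("Hello world")
    with pytest.raises(ValueError):  # "... expects a Doc as first argument ..."
        Example.from_dict(None, {"words": ["Hello", "world"]})
    with pytest.raises(ValueError):  # "... expects a dict as second argument ..."
        Example.from_dict(doc, None)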
@@ -257,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([vocab.morphology.add(v) for v in value])
         else:
             attrs.append(key)
+            try:
                 values.append([vocab.strings.add(v) for v in value])
+            except TypeError:
+                types = set([type(v) for v in value])
+                raise TypeError(Errors.E969.format(field=key, types=types))

     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
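Non-string annotation values now surface the new E969 instead of an opaque TypeError from the StringStore. A hypothetical trigger (the "tags" field and its values are illustrative; assumes a build of this branch):

    import pytest
    from spacy.gold.example import Example
    from spacy.lang.en import English

    doc = English()("I like London")
    # Integer tags where strings are expected should now fail with
    # "Expected string values for field '{field}', but received ... instead."
    with pytest.raises(TypeError):
        Example.from_dict(doc, {"words": ["I", "like", "London"], "tags": [1, 2, 3]})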
@@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):

 def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
     assert contains_cycle(tree) is None
-    assert contains_cycle(cyclic_tree) == set([3, 4, 5])
+    assert contains_cycle(cyclic_tree) == {3, 4, 5}
     assert contains_cycle(partial_tree) is None
     assert contains_cycle(multirooted_tree) is None
@@ -5,6 +5,7 @@ from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.gold.converters import json2docs
 from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
@@ -272,72 +273,72 @@ def test_split_sentences(en_vocab):


 def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
-    words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "U-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs."), "PERSON"),  # "Mrs." is a Person
+        (len("Mr and "), len("Mr and Mrs"), "PERSON"),  # "Mrs" is a Person
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", None, "O", "U-LOC", "O"]


 def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
-    words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]


 def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
-    words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."]
+    words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
+    gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]

     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
+    gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
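The titles lose their periods throughout ("Mr." becomes "Mr"), which keeps the hand-computed character offsets unambiguous. A quick sanity check of the offset arithmetic these tests rely on (plain Python, nothing from spaCy needed):

    prefix = "Mr and Mrs Smith flew to "
    entity = "San Francisco Valley"
    # Entity annotations are (start_char, end_char, label) over the raw text:
    assert (len(prefix), len(prefix + entity)) == (25, 45)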
@@ -407,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
     assert spans[1].label_ == "GPE"


+def test_aligned_spans_y2x(en_vocab, en_tokenizer):
+    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
+    spaces = [True, True, True, False, False]
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    prefix = "Mr and Mrs Smith flew to "
+    entities = [
+        (0, len("Mr and Mrs Smith"), "PERSON"),
+        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
+    ]
+    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
+    ents_ref = example.reference.ents
+    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
+    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
+    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
+
+
+def test_aligned_spans_x2y(en_vocab, en_tokenizer):
+    text = "Mr and Mrs Smith flew to San Francisco Valley"
+    nlp = English()
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
+                {"label": "LOC", "pattern": "San Francisco Valley"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+    doc = nlp(text)
+    assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
+    prefix = "Mr and Mrs Smith flew to "
+    entities = [
+        (0, len("Mr and Mrs Smith"), "PERSON"),
+        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
+    ]
+    tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
+    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
+    assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
+
+    # Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
+    ents_pred = example.predicted.ents
+    assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
+    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
+    assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
+
+
 def test_gold_ner_missing_tags(en_tokenizer):
     doc = en_tokenizer("I flew to Silicon Valley via London.")
     biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
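In these helpers, x is the predicted tokenization and y the gold (reference) one, so get_aligned_spans_y2x projects reference spans onto predicted tokens and get_aligned_spans_x2y does the reverse. A minimal single-entity sketch of the y2x direction, assuming the API behaves as in the tests above:

    from spacy.gold.example import Example
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    # One coarse predicted token vs. four fine-grained reference tokens:
    doc = Doc(Vocab(), words=["Mr and Mrs Smith"], spaces=[False])
    example = Example.from_dict(
        doc,
        {"words": ["Mr", "and", "Mrs", "Smith"],
         "entities": [(0, len("Mr and Mrs Smith"), "PERSON")]},
    )
    (span,) = example.reference.ents             # covers reference tokens [0, 4)
    (mapped,) = example.get_aligned_spans_y2x([span])
    assert (mapped.start, mapped.end) == (0, 1)  # the single predicted token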
@@ -414,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
     assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]


+def test_projectivize(en_tokenizer):
+    doc = en_tokenizer("He pretty quickly walks away")
+    heads = [3, 2, 3, 0, 2]
+    example = Example.from_dict(doc, {"heads": heads})
+    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
+    nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
+    assert proj_heads == [3, 2, 3, 0, 3]
+    assert nonproj_heads == [3, 2, 3, 0, 2]
+
+
 def test_iob_to_biluo():
     good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
     good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
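Why the projectivized heads differ only in the last position: with heads = [3, 2, 3, 0, 2], the arc from "quickly" (index 2) to "away" (index 4) crosses "walks" (index 3), which "quickly" does not dominate, so the tree is non-projective. Projectivization lifts the offending arc by re-attaching the dependent to its head's head. An illustrative sketch of that lifting step (not spaCy's implementation):

    heads = [3, 2, 3, 0, 2]
    # Re-attach token 4 ("away") from token 2 ("quickly") to 2's own head:
    heads[4] = heads[heads[4]]
    assert heads == [3, 2, 3, 0, 3]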