diff --git a/spacy/errors.py b/spacy/errors.py
index 31533e7e2..5a4e0d0c7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,15 +477,14 @@ class Errors(object):
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
     # TODO: fix numbering after merging develop into master
+    E969 = ("Expected string values for field '{field}', but received {types} instead.")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
             "array and {doc_length} for the Doc itself.")
     E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
     E973 = ("Unexpected type for NER data")
     E974 = ("Unknown {obj} attribute: {key}")
-    E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
-            "but got {type}")
-    E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
+    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index ce1a0928b..f5b9f0eeb 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
 
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
-        """ Doc can either be text, or an actual Doc """
        if predicted is None:
             raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
@@ -59,17 +58,15 @@ cdef class Example:
 
     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
+        if predicted is None:
+            raise ValueError(Errors.E976.format(n="first", type="Doc"))
         if example_dict is None:
-            raise ValueError(Errors.E976)
-        if not isinstance(predicted, Doc):
-            raise TypeError(Errors.E975.format(type=type(predicted)))
+            raise ValueError(Errors.E976.format(n="second", type="dict"))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
-        if not _has_field(tok_dict, "SPACY"):
-            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -257,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
                 values.append([vocab.morphology.add(v) for v in value])
             else:
                 attrs.append(key)
-                values.append([vocab.strings.add(v) for v in value])
+                try:
+                    values.append([vocab.strings.add(v) for v in value])
+                except TypeError:
+                    types = set([type(v) for v in value])
+                    raise TypeError(Errors.E969.format(field=key, types=types))
 
     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py
index 86d9a0180..496ec7e03 100644
--- a/spacy/tests/parser/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
 
 def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
     assert contains_cycle(tree) is None
-    assert contains_cycle(cyclic_tree) == set([3, 4, 5])
+    assert contains_cycle(cyclic_tree) == {3, 4, 5}
     assert contains_cycle(partial_tree) is None
     assert contains_cycle(multirooted_tree) is None
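
The errors.py and example.pyx changes above tighten `Example`'s input validation: `from_dict` now raises the parameterized E976 when either argument is None, and `_annot2array` reports non-string annotation values through the new E969 instead of letting `vocab.strings.add` fail opaquely. A minimal sketch of the resulting behavior, assuming the develop-branch layout this patch targets; the printed messages are abbreviated, and the routing of "tags" through the generic string-field branch is an assumption:

    from spacy.lang.en import English
    from spacy.gold.example import Example

    nlp = English()
    doc = nlp("San Francisco")

    # E976: both arguments are now checked up front.
    try:
        Example.from_dict(None, {"words": ["San", "Francisco"]})
    except ValueError as err:
        print(err)  # "... expects a Doc as first argument, but received None."

    # E969: non-string values for a string-valued field surface the offending
    # types; here, integer tags are assumed to trigger it via the TAG field.
    try:
        Example.from_dict(doc, {"words": ["San", "Francisco"], "tags": [1, 2]})
    except TypeError as err:
        print(err)  # "Expected string values for field 'TAG', but received {<class 'int'>} ..."
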
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 24f2bbc13..7d3033560 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -5,6 +5,7 @@ from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.gold.converters import json2docs
 from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
@@ -272,72 +273,72 @@ def test_split_sentences(en_vocab):
 
 
 def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
-    words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
 
     entities = [
-        (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"),  # "Mrs. Smith" is a PERSON
+        (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"),  # "Mrs Smith" is a PERSON
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
 
     entities = [
-        (len("Mr. and "), len("Mr. and Mrs."), "PERSON"),  # "Mrs." is a Person
+        (len("Mr and "), len("Mr and Mrs"), "PERSON"),  # "Mrs" is a Person
         (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
     ]
-    gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", None, "O", "U-LOC", "O"]
 
 
 def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
-    words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
+    words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    prefix = "Mr. and Mrs. Smith flew to "
+    prefix = "Mr and Mrs Smith flew to "
     entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
-    gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."]
+    gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 
     entities = [
Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] def test_gold_biluo_misaligned(en_vocab, en_tokenizer): - words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."] + words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - prefix = "Mr. and Mrs. Smith flew to " + prefix = "Mr and Mrs Smith flew to " entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")] - gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."] + gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"] entities = [ - (len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON + (len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), ] - gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."] + gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) ner_tags = example.get_aligned_ner() assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"] @@ -407,6 +408,49 @@ def test_biluo_spans(en_tokenizer): assert spans[1].label_ == "GPE" +def test_aligned_spans_y2x(en_vocab, en_tokenizer): + words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."] + spaces = [True, True, True, False, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) + ents_ref = example.reference.ents + assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)] + ents_y2x = example.get_aligned_spans_y2x(ents_ref) + assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)] + + +def test_aligned_spans_x2y(en_vocab, en_tokenizer): + text = "Mr and Mrs Smith flew to San Francisco Valley" + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"}, + {"label": "LOC", "pattern": "San Francisco Valley"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)] + prefix = "Mr and Mrs Smith flew to " + entities = [ + (0, len("Mr and Mrs Smith"), "PERSON"), + (len(prefix), len(prefix + "San Francisco Valley"), "LOC"), + ] + tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"] + example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities}) 
+    assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
+
+    # Ensure that 'get_aligned_spans_x2y' returns the correctly aligned entities
+    ents_pred = example.predicted.ents
+    assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
+    ents_x2y = example.get_aligned_spans_x2y(ents_pred)
+    assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
+
+
 def test_gold_ner_missing_tags(en_tokenizer):
     doc = en_tokenizer("I flew to Silicon Valley via London.")
     biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@@ -414,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
     assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
 
 
+def test_projectivize(en_tokenizer):
+    doc = en_tokenizer("He pretty quickly walks away")
+    heads = [3, 2, 3, 0, 2]
+    example = Example.from_dict(doc, {"heads": heads})
+    proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
+    nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
+    assert proj_heads == [3, 2, 3, 0, 3]
+    assert nonproj_heads == [3, 2, 3, 0, 2]
+
+
 def test_iob_to_biluo():
     good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
     good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
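
The new test_aligned_spans_y2x/test_aligned_spans_x2y tests above pin down how reference ("y") spans are projected onto the predicted ("x") tokenization and back. A short usage sketch mirroring test_aligned_spans_y2x, with a bare `Vocab` standing in for the `en_vocab` fixture used by the tests:

    from spacy.gold.example import Example
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    # The predicted tokenization merges the name and the location into single tokens.
    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
    doc = Doc(vocab, words=words, spaces=[True, True, True, False, False])
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len("Mr and Mrs Smith"), "PERSON"),
        (len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
    ]
    words_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    example = Example.from_dict(doc, {"words": words_ref, "entities": entities})

    # Each multi-token reference span collapses onto the single merged token it
    # aligns to on the predicted side: (0, 4) -> (0, 1) and (6, 9) -> (3, 4).
    for ent in example.get_aligned_spans_y2x(example.reference.ents):
        print(ent.start, ent.end, ent.label_)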