diff --git a/spacy/errors.py b/spacy/errors.py index 1f035afcf..30b6e320d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -620,10 +620,6 @@ class Errors(object): E997 = ("Tokenizer special cases are not allowed to modify the text. " "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") - E998 = ("To create GoldParse objects from Example objects without a " - "Doc, get_gold_parses() should be called with a Vocab object.") - E999 = ("Encountered an unexpected format for the dictionary holding " - "gold annotations: {gold_dict}") @add_codes diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 1fc671010..24778ff77 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -9,11 +9,11 @@ from .align import Alignment from ..errors import Errors, AlignmentError -cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot): - # TODO: Improve and test this - words = tok_annot.get("ORTH", [tok.text for tok in predicted]) - attrs, array = _annot2array(predicted, tok_annot, doc_annot) - output = Doc(predicted.vocab, words=words) +cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): + """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH is set. """ + words = tok_annot["ORTH"] + attrs, array = _annot2array(vocab, tok_annot, doc_annot) + output = Doc(vocab, words=words) if array.size: output = output.from_array(attrs, array) output.cats.update(doc_annot.get("cats", {})) @@ -23,6 +23,7 @@ cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot): cdef class Example: def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None): """ Doc can either be text, or an actual Doc """ + assert predicted.vocab is reference.vocab msg = "Example.__init__ got None for '{arg}'. Requires Doc." if predicted is None: raise TypeError(msg.format(arg="predicted")) @@ -52,11 +53,13 @@ cdef class Example: raise ValueError("Example.from_dict expected dict, received None") if not isinstance(predicted, Doc): raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}") - example_dict = _fix_legacy_dict_data(predicted, example_dict) + example_dict = _fix_legacy_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict) + if "ORTH" not in tok_dict: + tok_dict["ORTH"] = [tok.text for tok in predicted] return Example( predicted, - annotations2doc(predicted, tok_dict, doc_dict) + annotations2doc(predicted.vocab, tok_dict, doc_dict) ) @property @@ -78,6 +81,7 @@ cdef class Example: gold_to_cand = alignment.gold_to_cand cand_to_gold = alignment.cand_to_gold + vocab = self.reference.vocab gold_values = self.reference.to_array([field]) output = [] for i, gold_i in enumerate(cand_to_gold): @@ -85,11 +89,11 @@ cdef class Example: output.append(None) elif gold_i is None: if i in i2j_multi: - output.append(gold_values[i2j_multi[i]]) + output.append(vocab.strings[gold_values[i2j_multi[i]]]) else: output.append(None) else: - output.append(gold_values[gold_i]) + output.append(vocab.strings[gold_values[gold_i]]) return output def to_dict(self): @@ -139,21 +143,21 @@ cdef class Example: return self.x.text -def _annot2array(predicted, tok_annot, doc_annot): +def _annot2array(vocab, tok_annot, doc_annot): attrs = [] values = [] for key, value in doc_annot.items(): if key == "entities": - words = tok_annot.get("ORTH", [tok.text for tok in predicted]) - ent_iobs, ent_types = _parse_ner_tags(predicted.vocab, words, value) + words = tok_annot["ORTH"] + ent_iobs, ent_types = _parse_ner_tags(vocab, words, value) tok_annot["ENT_IOB"] = ent_iobs tok_annot["ENT_TYPE"] = ent_types elif key == "links": entities = doc_annot.get("entities", {}) if value and not entities: raise ValueError(Errors.E981) - ent_kb_ids = _parse_links(predicted.vocab, words, value, entities) + ent_kb_ids = _parse_links(vocab, words, value, entities) tok_annot["ENT_KB_ID"] = ent_kb_ids elif key == "cats": pass @@ -173,7 +177,7 @@ def _annot2array(predicted, tok_annot, doc_annot): values.append(value) elif key == "MORPH": attrs.append(key) - values.append([predicted.vocab.morphology.add(v) for v in value]) + values.append([vocab.morphology.add(v) for v in value]) elif key == "ENT_IOB": iob_strings = Token.iob_strings() attrs.append(key) @@ -183,7 +187,7 @@ def _annot2array(predicted, tok_annot, doc_annot): raise ValueError(Errors.E982.format(values=iob_strings, value=values)) else: attrs.append(key) - values.append([predicted.vocab.strings.add(v) for v in value]) + values.append([vocab.strings.add(v) for v in value]) array = numpy.asarray(values, dtype="uint64") return attrs, array.T @@ -196,7 +200,7 @@ def _parse_example_dict_data(example_dict): ) -def _fix_legacy_dict_data(predicted, example_dict): +def _fix_legacy_dict_data(example_dict): token_dict = example_dict.get("token_annotation", {}) doc_dict = example_dict.get("doc_annotation", {}) for key, value in example_dict.items(): diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 3cacf82ae..517329dba 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -28,17 +28,20 @@ def test_Example_from_dict_basic(): def test_Example_from_dict_invalid(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) - with pytest.raises(ValueError): + with pytest.raises(KeyError): Example.from_dict(predicted, annots) -@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}]) -def test_Example_from_dict_with_tags(annots): +@pytest.mark.parametrize("gold_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]]) +@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}]) +def test_Example_from_dict_with_tags(gold_words, annots): vocab = Vocab() - predicted = Doc(vocab, words=annots["words"]) + predicted = Doc(vocab, words=gold_words) example = Example.from_dict(predicted, annots) for i, token in enumerate(example.reference): assert token.tag_ == annots["tags"][i] + aligned_tags = example.get_aligned("tag") + assert aligned_tags == ["NN" for _ in predicted] @pytest.mark.parametrize(