diff --git a/spacy/errors.py b/spacy/errors.py
index 8efef8333..e4f6610ee 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -581,6 +581,10 @@ class Errors(object):
 
     # TODO: fix numbering after merging develop into master
+    E983 = ("Each link annotation should refer to a dictionary with at most one "
+            "identifier mapping to 1.0, and all others to 0.0.")
+    E984 = ("The offsets of the annotations for 'links' need to refer exactly "
+            "to the offsets of the 'entities' annotations.")
     E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
             "into {values}, but found {value}.")
     E986 = ("Could not create any training batches: check your input. "
diff --git a/spacy/gold/new_example.pyx b/spacy/gold/new_example.pyx
index 51007e8c3..d2492a29f 100644
--- a/spacy/gold/new_example.pyx
+++ b/spacy/gold/new_example.pyx
@@ -85,12 +85,28 @@ cdef class NewExample:
         return self.x.text
 
 
-def _annot2array(strings, tok_annot, doc_annot):
+def _annot2array(vocab, tok_annot, doc_annot):
     attrs = []
     values = []
+
+    for key, value in doc_annot.items():
+        if key == "entities":
+            words = tok_annot["ORTH"]
+            ent_iobs, ent_types = _parse_ner_tags(vocab, words, value)
+            tok_annot["ENT_IOB"] = ent_iobs
+            tok_annot["ENT_TYPE"] = ent_types
+        elif key == "links":
+            entities = doc_annot.get("entities", {})
+            if value and not entities:
+                raise ValueError(Errors.E984)
+            ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
+            tok_annot["ENT_KB_ID"] = ent_kb_ids
+        else:
+            raise ValueError(f"Unknown doc attribute: {key}")
+
     for key, value in tok_annot.items():
         if key not in IDS:
-            raise ValueError(f"Unknown attr: {key}")
+            raise ValueError(f"Unknown token attribute: {key}")
         elif key == "ORTH":
             pass
         elif key == "HEAD":
@@ -108,10 +124,8 @@ def _annot2array(strings, tok_annot, doc_annot):
             raise ValueError(Errors.E985.format(values=iob_strings, value=values))
         else:
             attrs.append(key)
-            values.append([strings.add(v) for v in value])
-    # TODO: Calculate token.ent_kb_id from doc_annot["links"].
-    # We need to fix this and the doc.ents thing, both should be doc
-    # annotations.
+            values.append([vocab.strings.add(v) for v in value])
+
     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
 
@@ -129,8 +143,10 @@ def _fix_legacy_dict_data(predicted, example_dict):
     for key, value in example_dict.items():
         if key in ("token_annotation", "doc_annotation"):
             pass
-        elif key in ("cats", "links"):
+        elif key in ("cats", "links") and value:
             doc_dict[key] = value
+        elif key in ("ner", "entities") and value:
+            doc_dict["entities"] = value
         else:
             token_dict[key] = value
     # Remap keys
@@ -149,12 +165,6 @@ def _fix_legacy_dict_data(predicted, example_dict):
     for key, value in old_token_dict.items():
         if key in remapping:
             token_dict[remapping[key]] = value
-        elif key in ("ner", "entities") and value:
-            # Arguably it would be smarter to put this in the doc annotation?
-            words = token_dict.get("words", [t.text for t in predicted])
-            ent_iobs, ent_types = _parse_ner_tags(predicted, words, value)
-            token_dict["ENT_IOB"] = ent_iobs
-            token_dict["ENT_TYPE"] = ent_types
         else:
             raise ValueError(f"Unknown attr: {key}")
     return {
@@ -163,16 +173,13 @@ def _fix_legacy_dict_data(predicted, example_dict):
     }
 
 
-def _parse_ner_tags(predicted, words, biluo_or_offsets):
+def _parse_ner_tags(vocab, words, biluo_or_offsets):
     if isinstance(biluo_or_offsets[0], (list, tuple)):
         # Convert to biluo if necessary
         # This is annoying but to convert the offsets we need a Doc
         # that has the target tokenization.
-        reference = Doc(
-            predicted.vocab,
-            words=words
-        )
-        biluo = biluo_tags_from_offsets(predicted, biluo_or_offsets)
+        reference = Doc(vocab, words=words)
+        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
     else:
         biluo = biluo_or_offsets
     ent_iobs = []
@@ -185,6 +192,37 @@ def _parse_ner_tags(predicted, words, biluo_or_offsets):
         ent_types.append("")
     return ent_iobs, ent_types
 
+def _parse_links(vocab, words, links, entities):
+    reference = Doc(vocab, words=words)
+
+    starts = {token.idx: token.i for token in reference}
+    ends = {token.idx + len(token): token.i for token in reference}
+    ent_kb_ids = ["" for _ in reference]
+    entity_map = [(ent[0], ent[1]) for ent in entities]
+
+    # links annotations need to refer 1-1 to entity annotations - throw error otherwise
+    for index, annot_dict in links.items():
+        start_char, end_char = index
+        if (start_char, end_char) not in entity_map:
+            raise ValueError(Errors.E984)
+
+    for index, annot_dict in links.items():
+        true_kb_ids = []
+        for key, value in annot_dict.items():
+            if value == 1.0:
+                true_kb_ids.append(key)
+            if len(true_kb_ids) > 1:
+                raise ValueError(Errors.E983)
+
+        if len(true_kb_ids) == 1:
+            start_char, end_char = index
+            start_token = starts.get(start_char)
+            end_token = ends.get(end_char)
+            for i in range(start_token, end_token+1):
+                ent_kb_ids[i] = true_kb_ids[0]
+
+    return ent_kb_ids
+
 
 class Example:
     def get_aligned(self, field):
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py
index 7a43cd9a6..4ebafb6bb 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@@ -41,33 +41,6 @@ def test_Example_from_dict_with_tags(annots):
         assert token.tag_ == annots["tags"][i]
 
 
-@pytest.mark.xfail(reason="TODO - fix")
-@pytest.mark.parametrize(
-    "annots",
-    [
-        {
-            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
-            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
-        }
-    ],
-)
-def test_Example_from_dict_with_entities(annots):
-    vocab = Vocab()
-    predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    assert len(list(eg.reference.ents)) == 2
-    assert eg.reference[0].ent_iob_ == "O"
-    assert eg.reference[1].ent_iob_ == "O"
-    assert eg.reference[2].ent_iob_ == "B"
-    assert eg.reference[3].ent_iob_ == "I"
-    assert eg.reference[4].ent_iob_ == "O"
-    assert eg.reference[5].ent_iob_ == "B"
-    assert eg.reference[6].ent_iob_ == "O"
-    assert eg.reference[2].ent_type_ == "LOC"
-    assert eg.reference[3].ent_type_ == "LOC"
-    assert eg.reference[5].ent_type_ == "LOC"
-
-
 @pytest.mark.parametrize(
     "annots",
     [
@@ -147,13 +120,39 @@ def test_Example_from_dict_with_cats(annots):
     assert eg.reference.cats["cat3"] == 0.5
 
 
-@pytest.mark.xfail(reason="TODO - fix")
 @pytest.mark.parametrize(
     "annots",
    [
         {
-            "words": ["Russ", "Cochran", "made", "reprints"],
-            "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
+        }
+    ],
+)
+def test_Example_from_dict_with_entities(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    eg = Example.from_dict(predicted, annots)
+    assert len(list(eg.reference.ents)) == 2
+    assert eg.reference[0].ent_iob_ == "O"
+    assert eg.reference[1].ent_iob_ == "O"
+    assert eg.reference[2].ent_iob_ == "B"
+    assert eg.reference[3].ent_iob_ == "I"
+    assert eg.reference[4].ent_iob_ == "O"
+    assert eg.reference[5].ent_iob_ == "B"
+    assert eg.reference[6].ent_iob_ == "O"
+    assert eg.reference[2].ent_type_ == "LOC"
+    assert eg.reference[3].ent_type_ == "LOC"
+    assert eg.reference[5].ent_type_ == "LOC"
+
+
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
+            "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}},
         }
     ],
 )
@@ -161,7 +160,28 @@ def test_Example_from_dict_with_links(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
     eg = Example.from_dict(predicted, annots)
-    assert eg.reference[0].ent_kb_id_ == "Q7381115"
-    assert eg.reference[1].ent_kb_id_ == "Q7381115"
-    assert eg.reference[2].ent_kb_id_ == ""
-    assert eg.reference[3].ent_kb_id_ == ""
+    assert eg.reference[0].ent_kb_id_ == ""
+    assert eg.reference[1].ent_kb_id_ == ""
+    assert eg.reference[2].ent_kb_id_ == "Q60"
+    assert eg.reference[3].ent_kb_id_ == "Q60"
+    assert eg.reference[4].ent_kb_id_ == ""
+    assert eg.reference[5].ent_kb_id_ == "Q64"
+    assert eg.reference[6].ent_kb_id_ == ""
+
+
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
+            "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
+        }
+    ],
+)
+def test_Example_from_dict_with_links_invalid(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    with pytest.raises(ValueError):
+        Example.from_dict(predicted, annots)
+