From f503817623627b0e7a8e7bdacdcda412fc9318c0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 2 Jul 2020 13:48:11 +0200 Subject: [PATCH 1/2] fix parsing entity links in new gold format --- spacy/errors.py | 2 -- spacy/gold/example.pyx | 16 +++------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 66a3c61da..6e7ec49ae 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -556,8 +556,6 @@ class Errors(object): E979 = ("Cannot convert {type} to an Example object.") E980 = ("Each link annotation should refer to a dictionary with at most one " "identifier mapping to 1.0, and all others to 0.0.") - E981 = ("The offsets of the annotations for 'links' need to refer exactly " - "to the offsets of the 'entities' annotations.") E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " "into {values}, but found {value}.") E983 = ("Invalid key for '{dict}': {key}. Available keys: " diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 5e36156a9..841b233c4 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot): if key == "entities": pass elif key == "links": - entities = doc_annot.get("entities", {}) - if not entities: - raise ValueError(Errors.E981) - ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities) + ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value) tok_annot["ENT_KB_ID"] = ent_kb_ids elif key == "cats": pass @@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): ent_types.append("") return ent_iobs, ent_types -def _parse_links(vocab, words, links, entities): - reference = Doc(vocab, words=words) +def _parse_links(vocab, words, spaces, links): + reference = Doc(vocab, words=words, spaces=spaces) starts = {token.idx: token.i for token in reference} ends = {token.idx + len(token): token.i for token in reference} ent_kb_ids = ["" for _ in reference] - entity_map = [(ent[0], ent[1]) for ent in entities] - - # links annotations need to refer 1-1 to entity annotations - throw error otherwise - for index, annot_dict in links.items(): - start_char, end_char = index - if (start_char, end_char) not in entity_map: - raise ValueError(Errors.E981) for index, annot_dict in links.items(): true_kb_ids = [] From 04ed4d60a84b9fdcd87476e3f8db5a7d4b7a8889 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 2 Jul 2020 13:57:35 +0200 Subject: [PATCH 2/2] raise error when links are not aligned to tokens --- spacy/errors.py | 2 ++ spacy/gold/example.pyx | 2 ++ spacy/tests/test_new_example.py | 3 +-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 6e7ec49ae..61ff5a037 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -556,6 +556,8 @@ class Errors(object): E979 = ("Cannot convert {type} to an Example object.") E980 = ("Each link annotation should refer to a dictionary with at most one " "identifier mapping to 1.0, and all others to 0.0.") + E981 = ("The offsets of the annotations for 'links' could not be aligned " + "to token boundaries.") E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " "into {values}, but found {value}.") E983 = ("Invalid key for '{dict}': {key}. Available keys: " diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 841b233c4..2ecee1821 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -396,6 +396,8 @@ def _parse_links(vocab, words, spaces, links): start_char, end_char = index start_token = starts.get(start_char) end_token = ends.get(end_char) + if start_token is None or end_token is None: + raise ValueError(Errors.E981) for i in range(start_token, end_token+1): ent_kb_ids[i] = true_kb_ids[0] diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index b89654554..58eab4a54 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots): [ { "words": ["I", "like", "New", "York", "and", "Berlin", "."], - "entities": [(7, 15, "LOC"), (20, 26, "LOC")], - "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}}, + "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}}, } ], )