Merge pull request #5693 from svlandeg/bugfix/nel-v3

This commit is contained in:
Ines Montani 2020-07-02 14:45:46 +02:00 committed by GitHub
commit 8a5b9a6d5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 8 additions and 17 deletions

View File

@@ -556,8 +556,8 @@ class Errors(object):
E979 = ("Cannot convert {type} to an Example object.") E979 = ("Cannot convert {type} to an Example object.")
E980 = ("Each link annotation should refer to a dictionary with at most one " E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.") "identifier mapping to 1.0, and all others to 0.0.")
E981 = ("The offsets of the annotations for 'links' need to refer exactly " E981 = ("The offsets of the annotations for 'links' could not be aligned "
"to the offsets of the 'entities' annotations.") "to token boundaries.")
E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
"into {values}, but found {value}.") "into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: " E983 = ("Invalid key for '{dict}': {key}. Available keys: "

View File

@@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key == "entities": if key == "entities":
pass pass
elif key == "links": elif key == "links":
entities = doc_annot.get("entities", {}) ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
if not entities:
raise ValueError(Errors.E981)
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
tok_annot["ENT_KB_ID"] = ent_kb_ids tok_annot["ENT_KB_ID"] = ent_kb_ids
elif key == "cats": elif key == "cats":
pass pass
@@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
ent_types.append("") ent_types.append("")
return ent_iobs, ent_types return ent_iobs, ent_types
def _parse_links(vocab, words, links, entities): def _parse_links(vocab, words, spaces, links):
reference = Doc(vocab, words=words) reference = Doc(vocab, words=words, spaces=spaces)
starts = {token.idx: token.i for token in reference} starts = {token.idx: token.i for token in reference}
ends = {token.idx + len(token): token.i for token in reference} ends = {token.idx + len(token): token.i for token in reference}
ent_kb_ids = ["" for _ in reference] ent_kb_ids = ["" for _ in reference]
entity_map = [(ent[0], ent[1]) for ent in entities]
# links annotations need to refer 1-1 to entity annotations - throw error otherwise
for index, annot_dict in links.items():
start_char, end_char = index
if (start_char, end_char) not in entity_map:
raise ValueError(Errors.E981)
for index, annot_dict in links.items(): for index, annot_dict in links.items():
true_kb_ids = [] true_kb_ids = []
@@ -406,6 +396,8 @@ def _parse_links(vocab, words, links, entities):
start_char, end_char = index start_char, end_char = index
start_token = starts.get(start_char) start_token = starts.get(start_char)
end_token = ends.get(end_char) end_token = ends.get(end_char)
if start_token is None or end_token is None:
raise ValueError(Errors.E981)
for i in range(start_token, end_token+1): for i in range(start_token, end_token+1):
ent_kb_ids[i] = true_kb_ids[0] ent_kb_ids[i] = true_kb_ids[0]

View File

@@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots):
[ [
{ {
"words": ["I", "like", "New", "York", "and", "Berlin", "."], "words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")], "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}},
"links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
} }
], ],
) )