mirror of https://github.com/explosion/spaCy.git
Entities on doc_annotation: parse links and check their offsets against the entities. Unit test works.
This commit is contained in:
parent 3aed177a35
commit 880dccf93e
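For orientation: the new "links" annotation keys on the same character offsets as the "entities" annotation and maps each span to KB-identifier probabilities, with at most one identifier at 1.0. A minimal sketch of the dict shape, taken from the test data below (the Example.from_dict call is this branch's in-progress API, shown only as a comment):

# Sketch of the annotation dict this commit teaches Example.from_dict to accept.
# In "I like New York and Berlin .", "New York" covers chars 7-15, "Berlin" 20-26.
annots = {
    "words": ["I", "like", "New", "York", "and", "Berlin", "."],
    "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
    # Each link keys on a (start_char, end_char) pair that must match an
    # entity exactly (E984); at most one KB id may map to 1.0 (E983).
    "links": {
        (7, 15): {"Q60": 1.0, "Q64": 0.0},
        (20, 26): {"Q60": 0.0, "Q64": 1.0},
    },
}
# eg = Example.from_dict(predicted_doc, annots)  # in-progress API on this branch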
@@ -581,6 +581,10 @@ class Errors(object):
     # TODO: fix numbering after merging develop into master
+    E983 = ("Each link annotation should refer to a dictionary with at most one "
+            "identifier mapping to 1.0, and all others to 0.0.")
+    E984 = ("The offsets of the annotations for 'links' need to refer exactly "
+            "to the offsets of the 'entities' annotations.")
     E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
             "into {values}, but found {value}.")
     E986 = ("Could not create any training batches: check your input. "
@@ -85,12 +85,28 @@ cdef class NewExample:
         return self.x.text


-def _annot2array(strings, tok_annot, doc_annot):
+def _annot2array(vocab, tok_annot, doc_annot):
     attrs = []
     values = []

+    for key, value in doc_annot.items():
+        if key == "entities":
+            words = tok_annot["ORTH"]
+            ent_iobs, ent_types = _parse_ner_tags(vocab, words, value)
+            tok_annot["ENT_IOB"] = ent_iobs
+            tok_annot["ENT_TYPE"] = ent_types
+        elif key == "links":
+            entities = doc_annot.get("entities", {})
+            if value and not entities:
+                raise ValueError(Errors.E984)
+            ent_kb_ids = _parse_links(vocab, words, value, entities)
+            tok_annot["ENT_KB_ID"] = ent_kb_ids
+        else:
+            raise ValueError(f"Unknown doc attribute: {key}")
+
     for key, value in tok_annot.items():
         if key not in IDS:
-            raise ValueError(f"Unknown attr: {key}")
+            raise ValueError(f"Unknown token attribute: {key}")
         elif key == "ORTH":
             pass
         elif key == "HEAD":
@@ -108,10 +124,8 @@ def _annot2array(strings, tok_annot, doc_annot):
             raise ValueError(Errors.E985.format(values=iob_strings, value=values))
         else:
             attrs.append(key)
-            values.append([strings.add(v) for v in value])
-    # TODO: Calculate token.ent_kb_id from doc_annot["links"].
-    # We need to fix this and the doc.ents thing, both should be doc
-    # annotations.
+            values.append([vocab.strings.add(v) for v in value])
     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
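As context for the hunk above, _annot2array accumulates one column of string-hash values per attribute and returns the transpose, so each row of the array describes one token. A minimal numpy sketch of that layout (the hash values are placeholders standing in for vocab.strings.add):

import numpy

# Toy stand-in for the attrs/values layout _annot2array builds: one list of
# uint64 string hashes per attribute, transposed so row i describes token i.
attrs = ["TAG", "ENT_TYPE"]
values = [
    [101, 102, 103],  # TAG hashes, one per token (placeholder numbers)
    [201, 0, 0],      # ENT_TYPE hashes, one per token
]
array = numpy.asarray(values, dtype="uint64")
assert array.T.shape == (3, 2)  # (n_tokens, n_attrs)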
@@ -129,8 +143,10 @@ def _fix_legacy_dict_data(predicted, example_dict):
     for key, value in example_dict.items():
         if key in ("token_annotation", "doc_annotation"):
             pass
-        elif key in ("cats", "links"):
+        elif key in ("cats", "links") and value:
             doc_dict[key] = value
+        elif key in ("ner", "entities") and value:
+            doc_dict["entities"] = value
         else:
             token_dict[key] = value
     # Remap keys
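The legacy-dict fix-up above is pure key routing: category and linking annotations go to the doc dict, legacy NER keys are renamed to "entities", and everything else stays token-level. A standalone toy sketch of that routing (no spaCy needed; the sample dict is illustrative):

# Toy routing mirroring the hunk above: split a flat legacy example dict
# into doc-level and token-level annotation dicts.
example_dict = {"words": ["Berlin"], "ner": ["U-LOC"], "cats": {"travel": 1.0}}
doc_dict, token_dict = {}, {}
for key, value in example_dict.items():
    if key in ("cats", "links") and value:
        doc_dict[key] = value
    elif key in ("ner", "entities") and value:
        doc_dict["entities"] = value
    else:
        token_dict[key] = value
assert doc_dict == {"cats": {"travel": 1.0}, "entities": ["U-LOC"]}
assert token_dict == {"words": ["Berlin"]}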
@@ -149,12 +165,6 @@ def _fix_legacy_dict_data(predicted, example_dict):
     for key, value in old_token_dict.items():
         if key in remapping:
             token_dict[remapping[key]] = value
-        elif key in ("ner", "entities") and value:
-            # Arguably it would be smarter to put this in the doc annotation?
-            words = token_dict.get("words", [t.text for t in predicted])
-            ent_iobs, ent_types = _parse_ner_tags(predicted, words, value)
-            token_dict["ENT_IOB"] = ent_iobs
-            token_dict["ENT_TYPE"] = ent_types
         else:
             raise ValueError(f"Unknown attr: {key}")
     return {
@@ -163,16 +173,13 @@ def _fix_legacy_dict_data(predicted, example_dict):
     }


-def _parse_ner_tags(predicted, words, biluo_or_offsets):
+def _parse_ner_tags(vocab, words, biluo_or_offsets):
     if isinstance(biluo_or_offsets[0], (list, tuple)):
         # Convert to biluo if necessary
         # This is annoying but to convert the offsets we need a Doc
         # that has the target tokenization.
-        reference = Doc(
-            predicted.vocab,
-            words=words
-        )
-        biluo = biluo_tags_from_offsets(predicted, biluo_or_offsets)
+        reference = Doc(vocab, words=words)
+        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
     else:
         biluo = biluo_or_offsets
     ent_iobs = []
@@ -185,6 +192,37 @@ def _parse_ner_tags(predicted, words, biluo_or_offsets):
             ent_types.append("")
     return ent_iobs, ent_types


+def _parse_links(vocab, words, links, entities):
+    reference = Doc(vocab, words=words)
+
+    starts = {token.idx: token.i for token in reference}
+    ends = {token.idx + len(token): token.i for token in reference}
+    ent_kb_ids = ["" for _ in reference]
+    entity_map = [(ent[0], ent[1]) for ent in entities]
+
+    # links annotations need to refer 1-1 to entity annotations - throw error otherwise
+    for index, annot_dict in links.items():
+        start_char, end_char = index
+        if (start_char, end_char) not in entity_map:
+            raise ValueError(Errors.E984)
+
+    for index, annot_dict in links.items():
+        true_kb_ids = []
+        for key, value in annot_dict.items():
+            if value == 1.0:
+                true_kb_ids.append(key)
+        if len(true_kb_ids) > 1:
+            raise ValueError(Errors.E983)
+
+        if len(true_kb_ids) == 1:
+            start_char, end_char = index
+            start_token = starts.get(start_char)
+            end_token = ends.get(end_char)
+            for i in range(start_token, end_token+1):
+                ent_kb_ids[i] = true_kb_ids[0]
+
+    return ent_kb_ids
+
+
 class Example:
     def get_aligned(self, field):
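The heart of _parse_links is mapping each link's character span onto token indices via the starts/ends lookup tables. A self-contained sketch of that mapping under a toy whitespace tokenization (a real Doc supplies token.idx and token.i; all names here are illustrative):

# Toy stand-in for Doc: recover each token's start offset, assuming one
# space between tokens -- the same text layout the tests below use.
words = ["I", "like", "New", "York", "and", "Berlin", "."]
offsets = []
idx = 0
for word in words:
    offsets.append(idx)
    idx += len(word) + 1

starts = {start: i for i, start in enumerate(offsets)}
ends = {start + len(word): i for i, (start, word) in enumerate(zip(offsets, words))}

# The link keyed on chars (7, 15) ("New York") resolves to tokens 2..3:
start_token = starts.get(7)   # -> 2
end_token = ends.get(15)      # -> 3
ent_kb_ids = ["" for _ in words]
for i in range(start_token, end_token + 1):
    ent_kb_ids[i] = "Q60"
assert ent_kb_ids == ["", "", "Q60", "Q60", "", "", ""]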
@@ -41,33 +41,6 @@ def test_Example_from_dict_with_tags(annots):
         assert token.tag_ == annots["tags"][i]


-@pytest.mark.xfail(reason="TODO - fix")
-@pytest.mark.parametrize(
-    "annots",
-    [
-        {
-            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
-            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
-        }
-    ],
-)
-def test_Example_from_dict_with_entities(annots):
-    vocab = Vocab()
-    predicted = Doc(vocab, words=annots["words"])
-    eg = Example.from_dict(predicted, annots)
-    assert len(list(eg.reference.ents)) == 2
-    assert eg.reference[0].ent_iob_ == "O"
-    assert eg.reference[1].ent_iob_ == "O"
-    assert eg.reference[2].ent_iob_ == "B"
-    assert eg.reference[3].ent_iob_ == "I"
-    assert eg.reference[4].ent_iob_ == "O"
-    assert eg.reference[5].ent_iob_ == "B"
-    assert eg.reference[6].ent_iob_ == "O"
-    assert eg.reference[2].ent_type_ == "LOC"
-    assert eg.reference[3].ent_type_ == "LOC"
-    assert eg.reference[5].ent_type_ == "LOC"
-
-
 @pytest.mark.parametrize(
     "annots",
     [
@@ -147,13 +120,39 @@ def test_Example_from_dict_with_cats(annots):
     assert eg.reference.cats["cat3"] == 0.5


-@pytest.mark.xfail(reason="TODO - fix")
 @pytest.mark.parametrize(
     "annots",
     [
         {
-            "words": ["Russ", "Cochran", "made", "reprints"],
-            "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
+        }
+    ],
+)
+def test_Example_from_dict_with_entities(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    eg = Example.from_dict(predicted, annots)
+    assert len(list(eg.reference.ents)) == 2
+    assert eg.reference[0].ent_iob_ == "O"
+    assert eg.reference[1].ent_iob_ == "O"
+    assert eg.reference[2].ent_iob_ == "B"
+    assert eg.reference[3].ent_iob_ == "I"
+    assert eg.reference[4].ent_iob_ == "O"
+    assert eg.reference[5].ent_iob_ == "B"
+    assert eg.reference[6].ent_iob_ == "O"
+    assert eg.reference[2].ent_type_ == "LOC"
+    assert eg.reference[3].ent_type_ == "LOC"
+    assert eg.reference[5].ent_type_ == "LOC"
+
+
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
+            "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}},
         }
     ],
 )
@@ -161,7 +160,28 @@ def test_Example_from_dict_with_links(annots):
     vocab = Vocab()
     predicted = Doc(vocab, words=annots["words"])
     eg = Example.from_dict(predicted, annots)
-    assert eg.reference[0].ent_kb_id_ == "Q7381115"
-    assert eg.reference[1].ent_kb_id_ == "Q7381115"
-    assert eg.reference[2].ent_kb_id_ == ""
-    assert eg.reference[3].ent_kb_id_ == ""
+    assert eg.reference[0].ent_kb_id_ == ""
+    assert eg.reference[1].ent_kb_id_ == ""
+    assert eg.reference[2].ent_kb_id_ == "Q60"
+    assert eg.reference[3].ent_kb_id_ == "Q60"
+    assert eg.reference[4].ent_kb_id_ == ""
+    assert eg.reference[5].ent_kb_id_ == "Q64"
+    assert eg.reference[6].ent_kb_id_ == ""
+
+
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
+            "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
+        }
+    ],
+)
+def test_Example_from_dict_with_links_invalid(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    with pytest.raises(ValueError):
+        Example.from_dict(predicted, annots)