mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-10 16:40:34 +03:00
Fix handling of NER data in Example
This commit is contained in:
parent
359e874766
commit
7eb064854e
|
@ -4,6 +4,8 @@ import numpy
|
|||
|
||||
from ..tokens import Token
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..tokens.span cimport Span
|
||||
from ..tokens.span import Span
|
||||
from ..attrs import IDS
|
||||
from .align cimport Alignment
|
||||
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
||||
|
@ -19,6 +21,8 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
|||
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
||||
if array.size:
|
||||
output = output.from_array(attrs, array)
|
||||
if "entities" in doc_annot:
|
||||
_add_entities_to_doc(output, doc_annot["entities"])
|
||||
# TODO: links ?!
|
||||
output.cats.update(doc_annot.get("cats", {}))
|
||||
return output
|
||||
|
@ -99,29 +103,6 @@ cdef class Example:
|
|||
output[i] = None
|
||||
else:
|
||||
output[i] = gold_values[gold_i]
|
||||
|
||||
if field in ["ENT_IOB"]:
|
||||
# Fix many-to-one IOB codes
|
||||
prev_j = -1
|
||||
prev_value = -1
|
||||
for i, value in enumerate(output):
|
||||
if i in i2j_multi:
|
||||
j = i2j_multi[i]
|
||||
if j == prev_j and prev_value == value == 3:
|
||||
output[i] = 1 # set B to I
|
||||
prev_j = j
|
||||
else:
|
||||
prev_j = -1
|
||||
prev_value = value
|
||||
|
||||
if field in ["ENT_IOB", "ENT_TYPE", "ENT_KB_ID"]:
|
||||
# Assign one-to-many NER tags
|
||||
for j, cand_j in enumerate(gold_to_cand):
|
||||
if cand_j is None:
|
||||
if j in j2i_multi:
|
||||
i = j2i_multi[j]
|
||||
if output[i] is None:
|
||||
output[i] = gold_values[j]
|
||||
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||
return output
|
||||
|
@ -145,15 +126,30 @@ cdef class Example:
|
|||
|
||||
def get_aligned_ner(self):
|
||||
x_ents = []
|
||||
gold_to_cand = self.alignment.gold_to_cand
|
||||
for y_ent in self.y.ents:
|
||||
x_span = self.x.char_span(y_ent.start_char, y_ent.end_char, label=y_ent.label)
|
||||
if x_span is not None:
|
||||
x_ents.append(x_span)
|
||||
x_start = gold_to_cand[y_ent.start]
|
||||
x_end = gold_to_cand[y_ent.end-1]
|
||||
if x_start is not None and x_end is not None:
|
||||
x_ents.append(Span(self.x, x_start, x_end+1, label=y_ent.label))
|
||||
else:
|
||||
x_span = self.x.char_span(
|
||||
y_ent.start_char,
|
||||
y_ent.end_char,
|
||||
label=y_ent.label
|
||||
)
|
||||
if x_span is not None:
|
||||
x_ents.append(x_span)
|
||||
x_tags = biluo_tags_from_offsets(
|
||||
self.x,
|
||||
[(e.start_char, e.end_char, e.label_) for e in x_ents],
|
||||
missing="O"
|
||||
)
|
||||
for token in self.y:
|
||||
if token.ent_iob == 0:
|
||||
cand_i = gold_to_cand[token.i]
|
||||
if cand_i is not None:
|
||||
x_tags[cand_i] = None
|
||||
return x_tags
|
||||
|
||||
def to_dict(self):
|
||||
|
@ -222,11 +218,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
|||
for key, value in doc_annot.items():
|
||||
if value:
|
||||
if key == "entities":
|
||||
words = tok_annot["ORTH"]
|
||||
spaces = tok_annot["SPACY"]
|
||||
ent_iobs, ent_types = _parse_ner_tags(value, vocab, words, spaces)
|
||||
tok_annot["ENT_IOB"] = ent_iobs
|
||||
tok_annot["ENT_TYPE"] = ent_types
|
||||
pass
|
||||
elif key == "links":
|
||||
entities = doc_annot.get("entities", {})
|
||||
if value and not entities:
|
||||
|
@ -252,13 +244,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
|||
elif key == "MORPH":
|
||||
attrs.append(key)
|
||||
values.append([vocab.morphology.add(v) for v in value])
|
||||
elif key == "ENT_IOB":
|
||||
iob_strings = Token.iob_strings()
|
||||
attrs.append(key)
|
||||
try:
|
||||
values.append([iob_strings.index(v) for v in value])
|
||||
except ValueError:
|
||||
raise ValueError(Errors.E982.format(values=iob_strings, value=values))
|
||||
else:
|
||||
attrs.append(key)
|
||||
values.append([vocab.strings.add(v) for v in value])
|
||||
|
@ -267,6 +252,29 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
|||
return attrs, array.T
|
||||
|
||||
|
||||
def _add_entities_to_doc(doc, ner_data):
|
||||
if ner_data is None:
|
||||
return
|
||||
elif ner_data == []:
|
||||
doc.ents = []
|
||||
elif isinstance(ner_data[0], tuple):
|
||||
return _add_entities_to_doc(
|
||||
doc,
|
||||
biluo_tags_from_offsets(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], str) or ner_data[0] is None:
|
||||
return _add_entities_to_doc(
|
||||
doc,
|
||||
spans_from_biluo_tags(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], Span):
|
||||
# Ugh, this is super messy. Really hard to set O entities
|
||||
doc.ents = ner_data
|
||||
doc.ents = [span for span in ner_data if span.label_]
|
||||
else:
|
||||
raise ValueError("Unexpected type for NER data")
|
||||
|
||||
|
||||
def _parse_example_dict_data(example_dict):
|
||||
return (
|
||||
example_dict["token_annotation"],
|
||||
|
@ -332,7 +340,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
|||
ent_iobs = []
|
||||
ent_types = []
|
||||
for iob_tag in biluo_to_iob(biluo):
|
||||
if iob_tag is None:
|
||||
if iob_tag in (None, "-"):
|
||||
ent_iobs.append("")
|
||||
ent_types.append("")
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue
Block a user