mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 09:00:36 +03:00
Fix handling of NER data in Example
This commit is contained in:
parent
359e874766
commit
7eb064854e
|
@ -4,6 +4,8 @@ import numpy
|
||||||
|
|
||||||
from ..tokens import Token
|
from ..tokens import Token
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
from ..tokens.span cimport Span
|
||||||
|
from ..tokens.span import Span
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
from .align cimport Alignment
|
from .align cimport Alignment
|
||||||
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
||||||
|
@ -19,6 +21,8 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||||
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
||||||
if array.size:
|
if array.size:
|
||||||
output = output.from_array(attrs, array)
|
output = output.from_array(attrs, array)
|
||||||
|
if "entities" in doc_annot:
|
||||||
|
_add_entities_to_doc(output, doc_annot["entities"])
|
||||||
# TODO: links ?!
|
# TODO: links ?!
|
||||||
output.cats.update(doc_annot.get("cats", {}))
|
output.cats.update(doc_annot.get("cats", {}))
|
||||||
return output
|
return output
|
||||||
|
@ -99,29 +103,6 @@ cdef class Example:
|
||||||
output[i] = None
|
output[i] = None
|
||||||
else:
|
else:
|
||||||
output[i] = gold_values[gold_i]
|
output[i] = gold_values[gold_i]
|
||||||
|
|
||||||
if field in ["ENT_IOB"]:
|
|
||||||
# Fix many-to-one IOB codes
|
|
||||||
prev_j = -1
|
|
||||||
prev_value = -1
|
|
||||||
for i, value in enumerate(output):
|
|
||||||
if i in i2j_multi:
|
|
||||||
j = i2j_multi[i]
|
|
||||||
if j == prev_j and prev_value == value == 3:
|
|
||||||
output[i] = 1 # set B to I
|
|
||||||
prev_j = j
|
|
||||||
else:
|
|
||||||
prev_j = -1
|
|
||||||
prev_value = value
|
|
||||||
|
|
||||||
if field in ["ENT_IOB", "ENT_TYPE", "ENT_KB_ID"]:
|
|
||||||
# Assign one-to-many NER tags
|
|
||||||
for j, cand_j in enumerate(gold_to_cand):
|
|
||||||
if cand_j is None:
|
|
||||||
if j in j2i_multi:
|
|
||||||
i = j2i_multi[j]
|
|
||||||
if output[i] is None:
|
|
||||||
output[i] = gold_values[j]
|
|
||||||
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
||||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||||
return output
|
return output
|
||||||
|
@ -145,15 +126,30 @@ cdef class Example:
|
||||||
|
|
||||||
def get_aligned_ner(self):
|
def get_aligned_ner(self):
|
||||||
x_ents = []
|
x_ents = []
|
||||||
|
gold_to_cand = self.alignment.gold_to_cand
|
||||||
for y_ent in self.y.ents:
|
for y_ent in self.y.ents:
|
||||||
x_span = self.x.char_span(y_ent.start_char, y_ent.end_char, label=y_ent.label)
|
x_start = gold_to_cand[y_ent.start]
|
||||||
if x_span is not None:
|
x_end = gold_to_cand[y_ent.end-1]
|
||||||
x_ents.append(x_span)
|
if x_start is not None and x_end is not None:
|
||||||
|
x_ents.append(Span(self.x, x_start, x_end+1, label=y_ent.label))
|
||||||
|
else:
|
||||||
|
x_span = self.x.char_span(
|
||||||
|
y_ent.start_char,
|
||||||
|
y_ent.end_char,
|
||||||
|
label=y_ent.label
|
||||||
|
)
|
||||||
|
if x_span is not None:
|
||||||
|
x_ents.append(x_span)
|
||||||
x_tags = biluo_tags_from_offsets(
|
x_tags = biluo_tags_from_offsets(
|
||||||
self.x,
|
self.x,
|
||||||
[(e.start_char, e.end_char, e.label_) for e in x_ents],
|
[(e.start_char, e.end_char, e.label_) for e in x_ents],
|
||||||
missing="O"
|
missing="O"
|
||||||
)
|
)
|
||||||
|
for token in self.y:
|
||||||
|
if token.ent_iob == 0:
|
||||||
|
cand_i = gold_to_cand[token.i]
|
||||||
|
if cand_i is not None:
|
||||||
|
x_tags[cand_i] = None
|
||||||
return x_tags
|
return x_tags
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
|
@ -222,11 +218,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
for key, value in doc_annot.items():
|
for key, value in doc_annot.items():
|
||||||
if value:
|
if value:
|
||||||
if key == "entities":
|
if key == "entities":
|
||||||
words = tok_annot["ORTH"]
|
pass
|
||||||
spaces = tok_annot["SPACY"]
|
|
||||||
ent_iobs, ent_types = _parse_ner_tags(value, vocab, words, spaces)
|
|
||||||
tok_annot["ENT_IOB"] = ent_iobs
|
|
||||||
tok_annot["ENT_TYPE"] = ent_types
|
|
||||||
elif key == "links":
|
elif key == "links":
|
||||||
entities = doc_annot.get("entities", {})
|
entities = doc_annot.get("entities", {})
|
||||||
if value and not entities:
|
if value and not entities:
|
||||||
|
@ -252,13 +244,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
elif key == "MORPH":
|
elif key == "MORPH":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.morphology.add(v) for v in value])
|
values.append([vocab.morphology.add(v) for v in value])
|
||||||
elif key == "ENT_IOB":
|
|
||||||
iob_strings = Token.iob_strings()
|
|
||||||
attrs.append(key)
|
|
||||||
try:
|
|
||||||
values.append([iob_strings.index(v) for v in value])
|
|
||||||
except ValueError:
|
|
||||||
raise ValueError(Errors.E982.format(values=iob_strings, value=values))
|
|
||||||
else:
|
else:
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append([vocab.strings.add(v) for v in value])
|
values.append([vocab.strings.add(v) for v in value])
|
||||||
|
@ -267,6 +252,29 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
return attrs, array.T
|
return attrs, array.T
|
||||||
|
|
||||||
|
|
||||||
|
def _add_entities_to_doc(doc, ner_data):
|
||||||
|
if ner_data is None:
|
||||||
|
return
|
||||||
|
elif ner_data == []:
|
||||||
|
doc.ents = []
|
||||||
|
elif isinstance(ner_data[0], tuple):
|
||||||
|
return _add_entities_to_doc(
|
||||||
|
doc,
|
||||||
|
biluo_tags_from_offsets(doc, ner_data)
|
||||||
|
)
|
||||||
|
elif isinstance(ner_data[0], str) or ner_data[0] is None:
|
||||||
|
return _add_entities_to_doc(
|
||||||
|
doc,
|
||||||
|
spans_from_biluo_tags(doc, ner_data)
|
||||||
|
)
|
||||||
|
elif isinstance(ner_data[0], Span):
|
||||||
|
# Ugh, this is super messy. Really hard to set O entities
|
||||||
|
doc.ents = ner_data
|
||||||
|
doc.ents = [span for span in ner_data if span.label_]
|
||||||
|
else:
|
||||||
|
raise ValueError("Unexpected type for NER data")
|
||||||
|
|
||||||
|
|
||||||
def _parse_example_dict_data(example_dict):
|
def _parse_example_dict_data(example_dict):
|
||||||
return (
|
return (
|
||||||
example_dict["token_annotation"],
|
example_dict["token_annotation"],
|
||||||
|
@ -332,7 +340,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
||||||
ent_iobs = []
|
ent_iobs = []
|
||||||
ent_types = []
|
ent_types = []
|
||||||
for iob_tag in biluo_to_iob(biluo):
|
for iob_tag in biluo_to_iob(biluo):
|
||||||
if iob_tag is None:
|
if iob_tag in (None, "-"):
|
||||||
ent_iobs.append("")
|
ent_iobs.append("")
|
||||||
ent_types.append("")
|
ent_types.append("")
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user