diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 21cee7dc4..9e96aaa60 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -4,6 +4,8 @@ import numpy from ..tokens import Token from ..tokens.doc cimport Doc +from ..tokens.span cimport Span +from ..tokens.span import Span from ..attrs import IDS from .align cimport Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc @@ -19,6 +21,8 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) if array.size: output = output.from_array(attrs, array) + if "entities" in doc_annot: + _add_entities_to_doc(output, doc_annot["entities"]) # TODO: links ?! output.cats.update(doc_annot.get("cats", {})) return output @@ -99,29 +103,6 @@ cdef class Example: output[i] = None else: output[i] = gold_values[gold_i] - - if field in ["ENT_IOB"]: - # Fix many-to-one IOB codes - prev_j = -1 - prev_value = -1 - for i, value in enumerate(output): - if i in i2j_multi: - j = i2j_multi[i] - if j == prev_j and prev_value == value == 3: - output[i] = 1 # set B to I - prev_j = j - else: - prev_j = -1 - prev_value = value - - if field in ["ENT_IOB", "ENT_TYPE", "ENT_KB_ID"]: - # Assign one-to-many NER tags - for j, cand_j in enumerate(gold_to_cand): - if cand_j is None: - if j in j2i_multi: - i = j2i_multi[j] - if output[i] is None: - output[i] = gold_values[j] if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] return output @@ -145,15 +126,30 @@ cdef class Example: def get_aligned_ner(self): x_ents = [] + gold_to_cand = self.alignment.gold_to_cand for y_ent in self.y.ents: - x_span = self.x.char_span(y_ent.start_char, y_ent.end_char, label=y_ent.label) - if x_span is not None: - x_ents.append(x_span) + x_start = gold_to_cand[y_ent.start] + x_end = gold_to_cand[y_ent.end-1] + if x_start is not None and x_end is not None: + x_ents.append(Span(self.x, x_start, x_end+1, label=y_ent.label)) + else: + x_span = self.x.char_span( + y_ent.start_char, + y_ent.end_char, + label=y_ent.label + ) + if x_span is not None: + x_ents.append(x_span) x_tags = biluo_tags_from_offsets( self.x, [(e.start_char, e.end_char, e.label_) for e in x_ents], missing="O" ) + for token in self.y: + if token.ent_iob == 0: + cand_i = gold_to_cand[token.i] + if cand_i is not None: + x_tags[cand_i] = None return x_tags def to_dict(self): @@ -222,11 +218,7 @@ def _annot2array(vocab, tok_annot, doc_annot): for key, value in doc_annot.items(): if value: if key == "entities": - words = tok_annot["ORTH"] - spaces = tok_annot["SPACY"] - ent_iobs, ent_types = _parse_ner_tags(value, vocab, words, spaces) - tok_annot["ENT_IOB"] = ent_iobs - tok_annot["ENT_TYPE"] = ent_types + pass elif key == "links": entities = doc_annot.get("entities", {}) if value and not entities: @@ -252,13 +244,6 @@ def _annot2array(vocab, tok_annot, doc_annot): elif key == "MORPH": attrs.append(key) values.append([vocab.morphology.add(v) for v in value]) - elif key == "ENT_IOB": - iob_strings = Token.iob_strings() - attrs.append(key) - try: - values.append([iob_strings.index(v) for v in value]) - except ValueError: - raise ValueError(Errors.E982.format(values=iob_strings, value=values)) else: attrs.append(key) values.append([vocab.strings.add(v) for v in value]) @@ -267,6 +252,29 @@ def _annot2array(vocab, tok_annot, doc_annot): return attrs, array.T +def _add_entities_to_doc(doc, ner_data): + if ner_data is None: + return + elif ner_data == []: + doc.ents = [] + elif isinstance(ner_data[0], tuple): + return _add_entities_to_doc( + doc, + biluo_tags_from_offsets(doc, ner_data) + ) + elif isinstance(ner_data[0], str) or ner_data[0] is None: + return _add_entities_to_doc( + doc, + spans_from_biluo_tags(doc, ner_data) + ) + elif isinstance(ner_data[0], Span): + # Ugh, this is super messy. Really hard to set O entities + doc.ents = ner_data + doc.ents = [span for span in ner_data if span.label_] + else: + raise ValueError("Unexpected type for NER data") + + def _parse_example_dict_data(example_dict): return ( example_dict["token_annotation"], @@ -332,7 +340,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): ent_iobs = [] ent_types = [] for iob_tag in biluo_to_iob(biluo): - if iob_tag is None: + if iob_tag in (None, "-"): ent_iobs.append("") ent_types.append("") else: