def get_aligned_ner(self):
    """Project the NER tags of ``self.y`` onto the tokens of ``self.x``.

    ``self.y`` is the gold side of ``self.alignment`` and ``self.x`` the
    candidate side. Tags for ``self.y`` are built from its entity spans
    with ``biluo_tags_from_offsets`` and then carried over to ``self.x``
    using the token alignment.

    RETURNS (list): One entry per token of ``self.x``: a tag string, or
        None where no tag could be assigned.
    """
    cand_to_gold = self.alignment.cand_to_gold
    gold_to_cand = self.alignment.gold_to_cand
    i2j_multi = self.alignment.i2j_multi
    j2i_multi = self.alignment.j2i_multi
    y_tags = biluo_tags_from_offsets(
        self.y,
        [(e.start_char, e.end_char, e.label_) for e in self.y.ents],
    )
    x_tags = [None] * self.x.length
    for i in range(self.x.length):
        if self.x[i].is_space:
            # Whitespace tokens are handled by the final pass below.
            continue
        if cand_to_gold[i] is not None:
            # One-to-one alignment: take the gold tag as is.
            x_tags[i] = y_tags[cand_to_gold[i]]
        elif i in i2j_multi:
            # Assign O/- for many-to-one O/- NER tags
            gold_tag = y_tags[i2j_multi[i]]
            if gold_tag in ("O", "-"):
                x_tags[i] = gold_tag
    # Assign O/- for one-to-many O/- NER tags
    for gold_i, cand_i in enumerate(gold_to_cand):
        if y_tags[gold_i] in ("O", "-"):
            if cand_i is None and gold_i in j2i_multi:
                x_tags[j2i_multi[gold_i]] = y_tags[gold_i]
    # TODO: This step was copied over from v2.x and the approach looks
    # questionable; revisit whether it is still the right thing to do.
    # If there is entity annotation and some tokens remain unaligned,
    # align all entities at the character level to account for all
    # possible token misalignments within the entity spans
    if list(self.y.ents) and None in x_tags:
        aligned_offsets = []
        aligned_spans = []
        # Keep only the spans that are non-empty and not pure whitespace
        for span in spans_from_biluo_tags(self.x, x_tags):
            if span and not span.text.isspace():
                aligned_offsets.append(
                    (span.start_char, span.end_char, span.label_)
                )
                aligned_spans.append(span)
        # Convert back to BILUO for doc tokens and assign NER for all
        # aligned spans
        aligned_tags = biluo_tags_from_offsets(
            self.x, aligned_offsets, missing=None
        )
        for span in aligned_spans:
            for i in range(span.start, span.end):
                x_tags[i] = aligned_tags[i]
    # Prevent whitespace that isn't within entities from being tagged as
    # an entity.
    n_tokens = self.x.length
    for i, token in enumerate(self.x):
        if not token.is_space:
            continue
        # Compare against the neighbours on either side of the whitespace
        # token. The left neighbour is i - 1; reading x_tags[i] here would
        # only re-read the whitespace token's own tag.
        prev_ner = x_tags[i - 1] if i >= 1 else None
        next_ner = x_tags[i + 1] if (i + 1) < n_tokens else None
        if prev_ner == "O" or next_ner == "O":
            x_tags[i] = "O"
    return x_tags