Work on Example.get_aligned_ner method

2025-12-24 10:33:17 +03:00 · 2020-06-22 14:55:33 +02:00 · 2020-06-22 14:55:33 +02:00 · 72ab21166d
commit 72ab21166d
parent 497fef4b5f
1 changed file with 58 additions and 0 deletions
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -5,6 +5,7 @@ from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
+from .iob_utils import spans_from_biluo_tags
 from .align import Alignment
 from ..errors import Errors, AlignmentError
 from ..structs cimport TokenC
@@ -140,6 +141,63 @@ cdef class Example:
                    aligned_deps[cand_i] = deps[gold_i]
        return aligned_heads, aligned_deps

+    def get_aligned_ner(self):
+        """Project the BILUO NER tags of self.y onto the tokens of self.x.
+
+        Returns one tag per self.x token; None where no tag was assigned.
+        """
+        cand_to_gold = self.alignment.cand_to_gold
+        gold_to_cand = self.alignment.gold_to_cand
+        i2j_multi = self.alignment.i2j_multi
+        j2i_multi = self.alignment.j2i_multi
+        y_ents = [(e.start_char, e.end_char, e.label_) for e in self.y.ents]
+        y_tags = biluo_tags_from_offsets(self.y, y_ents)
+        x_tags = [None] * self.x.length
+        for i in range(self.x.length):
+            if self.x[i].is_space:
+                pass
+            elif cand_to_gold[i] is not None:
+                x_tags[i] = y_tags[cand_to_gold[i]]
+            elif i in i2j_multi:
+                # Assign O/- for many-to-one O/- NER tags
+                if y_tags[i2j_multi[i]] in ("O", "-"):
+                    x_tags[i] = y_tags[i2j_multi[i]]
+        # Assign O/- for one-to-many O/- NER tags
+        for gold_i, cand_i in enumerate(gold_to_cand):
+            if y_tags[gold_i] in ("O", "-"):
+                if cand_i is None and gold_i in j2i_multi:
+                    x_tags[j2i_multi[gold_i]] = y_tags[gold_i]
+        # TODO: I'm copying this over from v2.x but this seems kind of nuts?
+        # If there is entity annotation and some tokens remain unaligned,
+        # align all entities at the character level to account for all
+        # possible token misalignments within the entity spans
+        if list(self.y.ents) and None in x_tags:
+            # Get offsets based on gold words and BILUO entities
+            aligned_offsets = []
+            aligned_spans = []
+            # Filter offsets to identify those that align with doc tokens
+            for span in spans_from_biluo_tags(self.x, x_tags):
+                if span and not span.text.isspace():
+                    offsets = (span.start_char, span.end_char, span.label_)
+                    aligned_offsets.append(offsets)
+                    aligned_spans.append(span)
+            # Convert back to BILUO for doc tokens and assign NER for all
+            # aligned spans
+            aligned_tags = biluo_tags_from_offsets(self.x, aligned_offsets, missing=None)
+            for span in aligned_spans:
+                for i in range(span.start, span.end):
+                    x_tags[i] = aligned_tags[i]
+            # Prevent whitespace that isn't within entities from being tagged as
+            # an entity.
+            for i, token in enumerate(self.x):
+                if token.is_space:
+                    # Compare against the neighbouring tokens on both sides.
+                    prev_ner = x_tags[i-1] if i >= 1 else None
+                    next_ner = x_tags[i+1] if (i+1) < self.x.length else None
+                    if prev_ner == "O" or next_ner == "O":
+                        x_tags[i] = "O"
+        return x_tags
+
    def to_dict(self):
        return {
            "doc_annotation": {