Modify setting missing and blocked entity tokens

In order to make it easier to construct `Doc` objects as training data,
modify how missing and blocked entity tokens are set to prioritize
setting `O` and missing entity tokens for training purposes over setting
blocked entity tokens.

* `Doc.ents` setter sets tokens outside entity spans to `O` regardless
of the current state of each token

* For `Doc.ents`, setting a span with a missing label sets the `ent_iob`
to missing instead of blocked

* `Doc.block_ents(spans)` marks spans as hard `O` for use with the
`EntityRecognizer`
This commit is contained in:
Adriane Boyd 2020-09-17 21:10:41 +02:00
parent 8303d101a5
commit 8b650f3a78
5 changed files with 42 additions and 21 deletions

View File

@@ -137,7 +137,7 @@ def test_doc_api_set_ents(en_tokenizer):
assert len(tokens.ents) == 0
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
assert len(list(tokens.ents)) == 1
assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
assert tokens.ents[0].label_ == "PRODUCT"
assert tokens.ents[0].start == 2
assert tokens.ents[0].end == 4
@@ -426,7 +426,7 @@ def test_has_annotation(en_vocab):
doc[0].lemma_ = "a"
doc[0].dep_ = "dep"
doc[0].head = doc[1]
doc.ents = [Span(doc, 0, 1, label="HELLO")]
doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")]
for attr in attrs:
assert doc.has_annotation(attr)
@@ -454,3 +454,17 @@ def test_is_flags_deprecated(en_tokenizer):
doc.is_nered
with pytest.deprecated_call():
doc.is_sentenced
def test_block_ents(en_tokenizer):
    """Blocked tokens get ent_iob == 3 with ent_type == 0 and are excluded
    from doc.ents; setting doc.ents afterwards repairs a dangling I token."""
    doc = en_tokenizer("a b c d e")
    doc.block_ents([doc[1:2], doc[3:5]])
    # blocked tokens: iob 3 (B) with no type; untouched tokens stay missing (0)
    assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
    assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
    # blocked tokens carry no entity type, so no spans are exposed
    assert doc.ents == tuple()
    # invalid IOB repaired
    doc.ents = [Span(doc, 3, 5, "ENT")]
    # tokens outside the entity are reset to O (2) regardless of prior state
    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
    # re-blocking token 3 turns the entity start into a blocked token;
    # the following I token (4) is promoted to B to keep the IOB sequence valid
    doc.block_ents([doc[3:4]])
    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]

View File

@@ -168,7 +168,7 @@ def test_accept_blocked_token():
ner2 = nlp2.create_pipe("ner", config=config)
# set "New York" to a blocked entity
doc2.ents = [(0, 3, 5)]
doc2.block_ents([doc2[3:5]])
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
@@ -358,5 +358,5 @@ class BlockerComponent1:
self.name = name
def __call__(self, doc):
doc.ents = [(0, self.start, self.end)]
doc.block_ents([doc[self.start:self.end]])
return doc

View File

@@ -590,17 +590,16 @@ cdef class Doc:
entity_type = 0
kb_id = 0
# Set ent_iob to Missing (0) by default unless this token was nered before
ent_iob = 0
if self.c[i].ent_iob != 0:
ent_iob = 2
# Set ent_iob to Outside (2) by default
ent_iob = 2
# overwrite if the token was part of a specified entity
if i in tokens_in_ents.keys():
ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
if entity_type is None or entity_type <= 0:
# Blocking this token from being overwritten by downstream NER
ent_iob = 3
# Empty label: Missing, unset this token
ent_iob = 0
entity_type = 0
elif ent_start == i:
# Marking the start of an entity
ent_iob = 3
@@ -612,6 +611,20 @@ cdef class Doc:
self.c[i].ent_kb_id = kb_id
self.c[i].ent_iob = ent_iob
def block_ents(self, spans):
    """Mark spans as never an entity for the EntityRecognizer.

    Each token in the given spans is set to ent_iob 3 (B) with ent_type 0,
    which the EntityRecognizer treats as a hard "O": the token can never be
    part of a predicted entity. Tokens outside the spans are left unchanged.

    spans (List[Span]): The spans to block as never entities.
    """
    for span in spans:
        for i in range(span.start, span.end):
            # B (3) with no entity type encodes "blocked" for the NER
            self.c[i].ent_iob = 3
            self.c[i].ent_type = 0
        # if the following token is I, set to B
        # (keeps the IOB sequence valid: an I may not follow a blocked token)
        if span.end < self.length:
            if self.c[span.end].ent_iob == 1:
                self.c[span.end].ent_iob = 3
@property
def noun_chunks(self):
"""Iterate over the base noun phrases in the document. Yields base

View File

@@ -172,7 +172,7 @@ cdef class Example:
return output
def get_aligned_ner(self):
if not self.y.is_nered:
if not self.y.has_annotation("ENT_IOB"):
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
x_ents = self.get_aligned_spans_y2x(self.y.ents)
# Default to 'None' for missing values
@@ -303,9 +303,7 @@ def _add_entities_to_doc(doc, ner_data):
spans_from_biluo_tags(doc, ner_data)
)
elif isinstance(ner_data[0], Span):
# Ugh, this is super messy. Really hard to set O entities
doc.ents = ner_data
doc.ents = [span for span in ner_data if span.label_]
else:
raise ValueError(Errors.E973)

View File

@@ -182,22 +182,18 @@ def tags_to_entities(tags):
entities = []
start = None
for i, tag in enumerate(tags):
if tag is None:
continue
if tag.startswith("O"):
if tag is None or tag.startswith("-"):
# TODO: We shouldn't be getting these malformed inputs. Fix this.
if start is not None:
start = None
else:
entities.append(("", i, i))
continue
elif tag == "-":
continue
elif tag.startswith("O"):
pass
elif tag.startswith("I"):
if start is None:
raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
continue
if tag.startswith("U"):
elif tag.startswith("U"):
entities.append((tag[2:], i, i))
elif tag.startswith("B"):
start = i