Distinction between outside, missing and blocked NER annotations (#4307)
* remove duplicate unit test
* unit test (currently failing) for issue 4267
* bugfix: ensure doc.ents preserves kb_id annotations
* fix in setting doc.ents with empty label
* rename
* test for presetting an entity to a certain type
* allow overwriting Outside + blocking presets
* fix actions when previous label needs to be kept
* fix default ent_iob in set entities
* cleaner solution with U- action
* remove debugging print statements
* unit tests with explicit transitions and is_valid testing
* remove U- from move_names explicitly
* remove unit tests with pre-trained models that don't work
* remove (working) unit tests with pre-trained models
* clean up unit tests
* move unit tests
* small fixes
* remove two TODO's from doc.ents comments
This commit is contained in: parent 72463b062f, commit de5a9ecdf3
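In short: a component that runs before the statistical NER can now block a span by assigning it to doc.ents with an unspecified label (0). Blocked tokens keep ent_iob "B" with an empty ent_type and the NER has to leave them alone; plain "O" predictions from an earlier (e.g. untrained) NER stay overwritable; tokens that were never annotated remain "missing". A minimal usage sketch, assembled from the test_block_ner test added further down in this diff (the label name and example sentence are illustrative only):

    from spacy.lang.en import English

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("MY_LABEL")          # any label; the untrained model will simply predict "O"
    nlp.add_pipe(ner)
    nlp.begin_training()

    doc = nlp.make_doc("This is Antti L Korhonen speaking in Finland")
    # Block tokens 2-5 ("Antti L Korhonen"): an entity tuple with label 0 means
    # "these tokens may never become part of a named entity".
    doc.ents = [(0, 2, 5)]
    doc = ner(doc)

    assert [t.ent_iob_ for t in doc] == ["O", "O", "B", "B", "B", "O", "O", "O"]
    assert [t.ent_type_ for t in doc] == ["", "", "", "", "", "", "", ""]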
@@ -118,7 +118,7 @@ class Errors(object):
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
     E013 = ("Error selecting action in matcher")
-    E014 = ("Uknown tag ID: {tag}")
+    E014 = ("Unknown tag ID: {tag}")
     E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
             "`force=True` to overwrite.")
     E016 = ("MultitaskObjective target should be function or one of: dep, "

@@ -66,7 +66,8 @@ cdef class BiluoPushDown(TransitionSystem):
             UNIT: Counter(),
             OUT: Counter()
         }
-        actions[OUT][''] = 1
+        actions[OUT][''] = 1  # Represents a token predicted to be outside of any entity
+        actions[UNIT][''] = 1  # Represents a token prohibited to be in an entity
        for entity_type in kwargs.get('entity_types', []):
            for action in (BEGIN, IN, LAST, UNIT):
                actions[action][entity_type] = 1

@@ -161,8 +162,7 @@ cdef class BiluoPushDown(TransitionSystem):
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]
-        else:
-            raise KeyError(Errors.E022.format(name=name))
+        raise KeyError(Errors.E022.format(name=name))
 
     cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
         # TODO: Apparent Cython bug here when we try to use the Transition()

@@ -266,7 +266,7 @@ cdef class Begin:
             return False
         elif label == 0:
             return False
-        elif preset_ent_iob == 1 or preset_ent_iob == 2:
+        elif preset_ent_iob == 1:
             # Ensure we don't clobber preset entities. If no entity preset,
             # ent_iob is 0
             return False

@@ -282,8 +282,8 @@ cdef class Begin:
             # Otherwise, force acceptance, even if we're across a sentence
             # boundary or the token is whitespace.
             return True
-        elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
-            # If the next word is B or O, we can't B now
+        elif st.B_(1).ent_iob == 3:
+            # If the next word is B, we can't B now
             return False
         elif st.B_(1).sent_start == 1:
             # Don't allow entities to extend across sentence boundaries

@@ -326,6 +326,7 @@ cdef class In:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
         cdef int preset_ent_iob = st.B_(0).ent_iob
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
             return False
         elif st.E_(0).ent_type != label:

@@ -335,13 +336,22 @@ cdef class In:
         elif st.B(1) == -1:
             # If we're at the end, we can't I.
             return False
-        elif preset_ent_iob == 2:
-            return False
         elif preset_ent_iob == 3:
             return False
-        elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
-            # If we know the next word is B or O, we can't be I (must be L)
+        elif st.B_(1).ent_iob == 3:
+            # If we know the next word is B, we can't be I (must be L)
             return False
+        elif preset_ent_iob == 1:
+            if st.B_(1).ent_iob in (0, 2):
+                # if next preset is missing or O, this can't be I (must be L)
+                return False
+            elif label != preset_ent_label:
+                # If label isn't right, reject
+                return False
+            else:
+                # Otherwise, force acceptance, even if we're across a sentence
+                # boundary or the token is whitespace.
+                return True
         elif st.B(1) != -1 and st.B_(1).sent_start == 1:
             # Don't allow entities to extend across sentence boundaries
             return False

@@ -387,17 +397,24 @@ cdef class In:
         else:
             return 1
 
 
 cdef class Last:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
+        cdef int preset_ent_iob = st.B_(0).ent_iob
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
             return False
         elif not st.entity_is_open():
             return False
-        elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
+        elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
             # If a preset entity has I followed by not-I, is L
-            return True
+            if label != preset_ent_label:
+                # If label isn't right, reject
+                return False
+            else:
+                # Otherwise, force acceptance, even if we're across a sentence
+                # boundary or the token is whitespace.
+                return True
         elif st.E_(0).ent_type != label:
             return False
         elif st.B_(1).ent_iob == 1:

@@ -450,12 +467,13 @@ cdef class Unit:
         cdef int preset_ent_iob = st.B_(0).ent_iob
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
-            return False
+            # this is only allowed if it's a preset blocked annotation
+            if preset_ent_label == 0 and preset_ent_iob == 3:
+                return True
+            else:
+                return False
         elif st.entity_is_open():
             return False
-        elif preset_ent_iob == 2:
-            # Don't clobber preset O
-            return False
         elif st.B_(1).ent_iob == 1:
             # If next token is In, we can't be Unit -- must be Begin
             return False

@@ -135,7 +135,9 @@ cdef class Parser:
         names = []
         for i in range(self.moves.n_moves):
             name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
-            names.append(name)
+            # Explicitly removing the internal "U-" token used for blocking entities
+            if name != "U-":
+                names.append(name)
         return names
 
     nr_feature = 8

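The label-less "U-" move (UNIT action with an empty label) that implements blocking is an internal detail, so the hunk above filters it out of the parser's public move_names. A rough sketch of the intended behaviour, reusing the moves.add_action calls from the tests below (action indices 4 and 5 are UNIT and OUT; the GPE label is illustrative):

    from spacy.vocab import Vocab
    from spacy.pipeline import EntityRecognizer

    ner = EntityRecognizer(Vocab())
    ner.moves.add_action(4, "")   # UNIT with an empty label -> the internal "U-" blocking move
    ner.moves.add_action(5, "")   # OUT with an empty label  -> the ordinary "O" move
    ner.add_label("GPE")

    # The blocking move stays available to the transition system,
    # but it is hidden from the public list of move names.
    assert "U-" not in ner.move_names
    assert "O" in ner.move_names
    assert "B-GPE" in ner.move_names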
@@ -16,10 +16,23 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     ner(doc)
     assert len(list(doc.ents)) == 0
     assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
+
     doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
+    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
+
     doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
+
+
+def test_ents_reset(en_vocab):
+    text = ["This", "is", "a", "lion"]
+    doc = get_doc(en_vocab, text)
+    ner = EntityRecognizer(en_vocab)
+    ner.begin_training([])
+    ner(doc)
+    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
+    doc.ents = list(doc.ents)
+    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
 
 
 def test_add_overlapping_entities(en_vocab):

@@ -2,7 +2,9 @@
 from __future__ import unicode_literals
 
 import pytest
-from spacy.pipeline import EntityRecognizer
+from spacy.lang.en import English
+
+from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
 from spacy.gold import GoldParse

@@ -80,14 +82,145 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
     assert names
 
 
-def test_doc_add_entities_set_ents_iob(en_vocab):
-    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
-    ner = EntityRecognizer(en_vocab)
-    ner.begin_training([])
-    ner(doc)
-    assert len(list(doc.ents)) == 0
-    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
-    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
-    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+def test_accept_blocked_token():
+    """Test successful blocking of tokens to be in an entity."""
+    # 1. test normal behaviour
+    nlp1 = English()
+    doc1 = nlp1("I live in New York")
+    ner1 = EntityRecognizer(doc1.vocab)
+    assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
+    assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
+
+    # Add the OUT action
+    ner1.moves.add_action(5, "")
+    ner1.add_label("GPE")
+    # Get into the state just before "New"
+    state1 = ner1.moves.init_batch([doc1])[0]
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    # Check that B-GPE is valid.
+    assert ner1.moves.is_valid(state1, "B-GPE")
+
+    # 2. test blocking behaviour
+    nlp2 = English()
+    doc2 = nlp2("I live in New York")
+    ner2 = EntityRecognizer(doc2.vocab)
+
+    # set "New York" to a blocked entity
+    doc2.ents = [(0, 3, 5)]
+    assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
+    assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
+
+    # Check that B-GPE is now invalid.
+    ner2.moves.add_action(4, "")
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state2 = ner2.moves.init_batch([doc2])[0]
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    # we can only use U- for "New"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+    ner2.moves.apply_transition(state2, "U-")
+    # we can only use U- for "York"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+
+
+def test_overwrite_token():
+    nlp = English()
+    ner1 = nlp.create_pipe("ner")
+    nlp.add_pipe(ner1, name="ner")
+    nlp.begin_training()
+
+    # The untrained NER will predict O for each token
+    doc = nlp("I live in New York")
+    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
+    assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
+
+    # Check that a new ner can overwrite O
+    ner2 = EntityRecognizer(doc.vocab)
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state = ner2.moves.init_batch([doc])[0]
+    assert ner2.moves.is_valid(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "U-GPE")
+    ner2.moves.apply_transition(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "I-GPE")
+    assert ner2.moves.is_valid(state, "L-GPE")
+
+
+def test_ruler_before_ner():
+    """ Test that an NER works after an entity_ruler: the second can add annotations """
+    nlp = English()
+
+    # 1 : Entity Ruler - should set "this" to B and everything else to empty
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # 2: untrained NER - should set everything else to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner)
+    nlp.begin_training()
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_ner_before_ruler():
+    """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """
+    nlp = English()
+
+    # 1: untrained NER - should set everything to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+
+    # 2 : Entity Ruler - should set "this" to B and keep everything else O
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_block_ner():
+    """ Test functionality for blocking tokens so they can't be in a named entity """
+    # block "Antti L Korhonen" from being a named entity
+    nlp = English()
+    nlp.add_pipe(BlockerComponent1(2, 5))
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+    doc = nlp("This is Antti L Korhonen speaking in Finland")
+    expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
+    expected_types = ["", "", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+class BlockerComponent1(object):
+    name = "my_blocker"
+
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __call__(self, doc):
+        doc.ents = [(0, self.start, self.end)]
+        return doc

@@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
 def test_issue999(train_data):
     """Test that adding entities and resuming training works passably OK.
     There are two issues here:
-    1) We have to readd labels. This isn't very nice.
+    1) We have to re-add labels. This isn't very nice.
     2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
     """

New file: spacy/tests/regression/test_issue4267.py (42 lines)
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+import spacy
+
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+from spacy.tokens import Span
+
+
+def test_issue4267():
+    """ Test that running an entity_ruler after ner gives consistent results"""
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    ner.add_label("PEOPLE")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    assert "ner" in nlp.pipe_names
+
+    # assert that we have correct IOB annotations
+    doc1 = nlp("hi")
+    assert doc1.is_nered
+    for token in doc1:
+        assert token.ent_iob == 2
+
+    # add entity ruler and run again
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
+
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+    assert "entity_ruler" in nlp.pipe_names
+    assert "ner" in nlp.pipe_names
+
+    # assert that we still have correct IOB annotations
+    doc2 = nlp("hi")
+    assert doc2.is_nered
+    for token in doc2:
+        assert token.ent_iob == 2

@@ -256,7 +256,7 @@ cdef class Doc:
     def is_nered(self):
         """Check if the document has named entities set. Will return True if
         *any* of the tokens has a named entity tag set (even if the others are
-        uknown values).
+        unknown values).
         """
         if len(self) == 0:
             return True

@@ -525,13 +525,11 @@ cdef class Doc:
 
         def __set__(self, ents):
             # TODO:
-            # 1. Allow negative matches
-            # 2. Ensure pre-set NERs are not over-written during statistical
-            #    prediction
-            # 3. Test basic data-driven ORTH gazetteer
-            # 4. Test more nuanced date and currency regex
+            # 1. Test basic data-driven ORTH gazetteer
+            # 2. Test more nuanced date and currency regex
             tokens_in_ents = {}
             cdef attr_t entity_type
             cdef attr_t kb_id
             cdef int ent_start, ent_end
             for ent_info in ents:
                 entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)

@@ -545,27 +543,31 @@ cdef class Doc:
                     tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
             cdef int i
             for i in range(self.length):
-                self.c[i].ent_type = 0
-                self.c[i].ent_kb_id = 0
-                self.c[i].ent_iob = 0  # Means missing.
-            cdef attr_t ent_type
-            cdef int start, end
-            for ent_info in ents:
-                ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
-                if ent_type is None or ent_type < 0:
-                    # Mark as O
-                    for i in range(start, end):
-                        self.c[i].ent_type = 0
-                        self.c[i].ent_kb_id = 0
-                        self.c[i].ent_iob = 2
-                else:
-                    # Mark (inside) as I
-                    for i in range(start, end):
-                        self.c[i].ent_type = ent_type
-                        self.c[i].ent_kb_id = ent_kb_id
-                        self.c[i].ent_iob = 1
-                    # Set start as B
-                    self.c[start].ent_iob = 3
+                # default values
+                entity_type = 0
+                kb_id = 0
+
+                # Set ent_iob to Missing (0) by default unless this token was nered before
+                ent_iob = 0
+                if self.c[i].ent_iob != 0:
+                    ent_iob = 2
+
+                # overwrite if the token was part of a specified entity
+                if i in tokens_in_ents.keys():
+                    ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
+                    if entity_type is None or entity_type <= 0:
+                        # Blocking this token from being overwritten by downstream NER
+                        ent_iob = 3
+                    elif ent_start == i:
+                        # Marking the start of an entity
+                        ent_iob = 3
+                    else:
+                        # Marking the inside of an entity
+                        ent_iob = 1
+
+                self.c[i].ent_type = entity_type
+                self.c[i].ent_kb_id = kb_id
+                self.c[i].ent_iob = ent_iob
 
     @property
     def noun_chunks(self):

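Taken together, the rewritten setter leaves each token in one of four states, which the updated Token.ent_iob_ docstring below also spells out: 0 (missing, never annotated), 2 ("O", previously annotated but outside the given entities), 3 ("B", entity start, or a blocked token when the entity type is 0/None) and 1 ("I", inside an entity). A small sketch of that behaviour on a fresh 4-token Doc (label and token indices are illustrative, mirroring the tests above):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["This", "is", "a", "lion"])

    doc.ents = [(vocab.strings.add("ANIMAL"), 3, 4)]
    # Tokens that were never annotated stay missing (""), not "O"
    assert [t.ent_iob_ for t in doc] == ["", "", "", "B"]

    doc.ents = [(0, 0, 2)]   # label 0 blocks "This is"
    # "a" was never annotated so it stays missing; "lion" was annotated above, so it becomes "O"
    assert [t.ent_iob_ for t in doc] == ["B", "B", "", "O"]
    assert [t.ent_type_ for t in doc] == ["", "", "", ""]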
@@ -749,7 +749,8 @@ cdef class Token:
     def ent_iob_(self):
         """IOB code of named entity tag. "B" means the token begins an entity,
         "I" means it is inside an entity, "O" means it is outside an entity,
-        and "" means no entity tag is set.
+        and "" means no entity tag is set. "B" with an empty ent_type
+        means that the token is blocked from further processing by NER.
 
         RETURNS (unicode): IOB code of named entity tag.
         """
