Distinction between outside, missing and blocked NER annotations (#4307)

* remove duplicate unit test

* unit test (currently failing) for issue 4267

* bugfix: ensure doc.ents preserves kb_id annotations

* fix in setting doc.ents with empty label

* rename

* test for presetting an entity to a certain type

* allow overwriting Outside + blocking presets

* fix actions when previous label needs to be kept

* fix default ent_iob in set entities

* cleaner solution with U- action

* remove debugging print statements

* unit tests with explicit transitions and is_valid testing

* remove U- from move_names explicitly

* remove unit tests with pre-trained models that don't work

* remove (working) unit tests with pre-trained models

* clean up unit tests

* move unit tests

* small fixes

* remove two TODO's from doc.ents comments
Sofie Van Landeghem 2019-09-18 21:37:17 +02:00 committed by Matthew Honnibal
parent 72463b062f
commit de5a9ecdf3
9 changed files with 273 additions and 62 deletions
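
As a quick orientation before the per-file diffs, here is a minimal sketch of the three per-token states this commit distinguishes. It is illustrative only and not part of the commit; the sentence and the blocked-span indices mirror test_accept_blocked_token further down.

    from spacy.lang.en import English

    nlp = English()

    # Missing: a fresh doc has no NER annotation at all, so ent_iob_ is ""
    doc = nlp("I live in New York")
    assert [t.ent_iob_ for t in doc] == ["", "", "", "", ""]

    # Blocked: an entity tuple whose type is 0 (or None) marks the span as
    # off-limits for the statistical NER; ent_iob_ becomes "B" with an empty
    # ent_type_ (see test_accept_blocked_token and test_block_ner below)
    doc.ents = [(0, 3, 5)]
    assert [t.ent_iob_ for t in doc] == ["", "", "", "B", "B"]
    assert [t.ent_type_ for t in doc] == ["", "", "", "", ""]

    # Outside: a token that an NER component has explicitly tagged as not
    # being part of any entity gets ent_iob_ "O", which is now kept distinct
    # from the missing state above (see test_overwrite_token below)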

View File

@@ -118,7 +118,7 @@ class Errors(object):
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
     E013 = ("Error selecting action in matcher")
-    E014 = ("Uknown tag ID: {tag}")
+    E014 = ("Unknown tag ID: {tag}")
     E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
             "`force=True` to overwrite.")
     E016 = ("MultitaskObjective target should be function or one of: dep, "

View File

@@ -66,7 +66,8 @@ cdef class BiluoPushDown(TransitionSystem):
             UNIT: Counter(),
             OUT: Counter()
         }
-        actions[OUT][''] = 1
+        actions[OUT][''] = 1  # Represents a token predicted to be outside of any entity
+        actions[UNIT][''] = 1  # Represents a token prohibited to be in an entity
         for entity_type in kwargs.get('entity_types', []):
             for action in (BEGIN, IN, LAST, UNIT):
                 actions[action][entity_type] = 1
@@ -161,8 +162,7 @@ cdef class BiluoPushDown(TransitionSystem):
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]
-        else:
-            raise KeyError(Errors.E022.format(name=name))
+        raise KeyError(Errors.E022.format(name=name))

     cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
         # TODO: Apparent Cython bug here when we try to use the Transition()
@@ -266,7 +266,7 @@ cdef class Begin:
             return False
         elif label == 0:
             return False
-        elif preset_ent_iob == 1 or preset_ent_iob == 2:
+        elif preset_ent_iob == 1:
             # Ensure we don't clobber preset entities. If no entity preset,
             # ent_iob is 0
             return False
@@ -282,8 +282,8 @@ cdef class Begin:
                 # Otherwise, force acceptance, even if we're across a sentence
                 # boundary or the token is whitespace.
                 return True
-        elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
-            # If the next word is B or O, we can't B now
+        elif st.B_(1).ent_iob == 3:
+            # If the next word is B, we can't B now
             return False
         elif st.B_(1).sent_start == 1:
             # Don't allow entities to extend across sentence boundaries
@@ -326,6 +326,7 @@ cdef class In:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
         cdef int preset_ent_iob = st.B_(0).ent_iob
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
             return False
         elif st.E_(0).ent_type != label:
@@ -335,13 +336,22 @@ cdef class In:
         elif st.B(1) == -1:
             # If we're at the end, we can't I.
             return False
-        elif preset_ent_iob == 2:
-            return False
         elif preset_ent_iob == 3:
             return False
-        elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
-            # If we know the next word is B or O, we can't be I (must be L)
+        elif st.B_(1).ent_iob == 3:
+            # If we know the next word is B, we can't be I (must be L)
             return False
+        elif preset_ent_iob == 1:
+            if st.B_(1).ent_iob in (0, 2):
+                # if next preset is missing or O, this can't be I (must be L)
+                return False
+            elif label != preset_ent_label:
+                # If label isn't right, reject
+                return False
+            else:
+                # Otherwise, force acceptance, even if we're across a sentence
+                # boundary or the token is whitespace.
+                return True
         elif st.B(1) != -1 and st.B_(1).sent_start == 1:
             # Don't allow entities to extend across sentence boundaries
             return False
@@ -387,17 +397,24 @@ cdef class In:
         else:
             return 1


 cdef class Last:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
+        cdef int preset_ent_iob = st.B_(0).ent_iob
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
             return False
         elif not st.entity_is_open():
             return False
-        elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
+        elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
             # If a preset entity has I followed by not-I, is L
-            return True
+            if label != preset_ent_label:
+                # If label isn't right, reject
+                return False
+            else:
+                # Otherwise, force acceptance, even if we're across a sentence
+                # boundary or the token is whitespace.
+                return True
         elif st.E_(0).ent_type != label:
             return False
         elif st.B_(1).ent_iob == 1:
@@ -450,12 +467,13 @@ cdef class Unit:
         cdef int preset_ent_iob = st.B_(0).ent_iob
         cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
-            return False
+            # this is only allowed if it's a preset blocked annotation
+            if preset_ent_label == 0 and preset_ent_iob == 3:
+                return True
+            else:
+                return False
         elif st.entity_is_open():
             return False
-        elif preset_ent_iob == 2:
-            # Don't clobber preset O
-            return False
         elif st.B_(1).ent_iob == 1:
             # If next token is In, we can't be Unit -- must be Begin
             return False
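
For reference when reading the integer comparisons in the transition checks above: the preset_ent_iob values map onto the Token.ent_iob codes that spaCy exposes. The dictionary below is an editorial aid, not code from the commit.

    # Token.ent_iob (int) vs Token.ent_iob_ (unicode), as used in the checks above
    IOB_CODES = {
        0: "",   # missing: no annotation set
        1: "I",  # inside an entity
        2: "O",  # outside any entity
        3: "B",  # begins an entity; combined with ent_type 0 this now means "blocked"
    }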

View File

@@ -135,7 +135,9 @@ cdef class Parser:
         names = []
         for i in range(self.moves.n_moves):
             name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
-            names.append(name)
+            # Explicitly removing the internal "U-" token used for blocking entities
+            if name != "U-":
+                names.append(name)
         return names

     nr_feature = 8
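
The practical effect of the filter above, sketched for an EntityRecognizer that has the blocked-token action registered (hypothetical snippet, not part of the commit):

    # The bare "U-" move (a UNIT action with an empty label) is only used
    # internally to represent blocked tokens, so it is hidden from the
    # serialized move names:
    assert "U-" not in ner.move_names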

View File

@@ -16,10 +16,23 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     ner(doc)
     assert len(list(doc.ents)) == 0
     assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
     doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
+    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
     doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
+
+
+def test_ents_reset(en_vocab):
+    text = ["This", "is", "a", "lion"]
+    doc = get_doc(en_vocab, text)
+    ner = EntityRecognizer(en_vocab)
+    ner.begin_training([])
+    ner(doc)
+    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
+    doc.ents = list(doc.ents)
+    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))


 def test_add_overlapping_entities(en_vocab):

View File

@@ -2,7 +2,9 @@
 from __future__ import unicode_literals

 import pytest
-from spacy.pipeline import EntityRecognizer
+from spacy.lang.en import English
+from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
 from spacy.gold import GoldParse
@@ -80,14 +82,145 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
     assert names


-def test_doc_add_entities_set_ents_iob(en_vocab):
-    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
-    ner = EntityRecognizer(en_vocab)
-    ner.begin_training([])
-    ner(doc)
-    assert len(list(doc.ents)) == 0
-    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
-    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
-    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+def test_accept_blocked_token():
+    """Test succesful blocking of tokens to be in an entity."""
+    # 1. test normal behaviour
+    nlp1 = English()
+    doc1 = nlp1("I live in New York")
+    ner1 = EntityRecognizer(doc1.vocab)
+    assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
+    assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
+
+    # Add the OUT action
+    ner1.moves.add_action(5, "")
+    ner1.add_label("GPE")
+    # Get into the state just before "New"
+    state1 = ner1.moves.init_batch([doc1])[0]
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    # Check that B-GPE is valid.
+    assert ner1.moves.is_valid(state1, "B-GPE")
+
+    # 2. test blocking behaviour
+    nlp2 = English()
+    doc2 = nlp2("I live in New York")
+    ner2 = EntityRecognizer(doc2.vocab)
+
+    # set "New York" to a blocked entity
+    doc2.ents = [(0, 3, 5)]
+    assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
+    assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
+
+    # Check that B-GPE is now invalid.
+    ner2.moves.add_action(4, "")
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state2 = ner2.moves.init_batch([doc2])[0]
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    # we can only use U- for "New"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+    ner2.moves.apply_transition(state2, "U-")
+    # we can only use U- for "York"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+
+
+def test_overwrite_token():
+    nlp = English()
+    ner1 = nlp.create_pipe("ner")
+    nlp.add_pipe(ner1, name="ner")
+    nlp.begin_training()
+
+    # The untrained NER will predict O for each token
+    doc = nlp("I live in New York")
+    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
+    assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
+
+    # Check that a new ner can overwrite O
+    ner2 = EntityRecognizer(doc.vocab)
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state = ner2.moves.init_batch([doc])[0]
+    assert ner2.moves.is_valid(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "U-GPE")
+    ner2.moves.apply_transition(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "I-GPE")
+    assert ner2.moves.is_valid(state, "L-GPE")
+
+
+def test_ruler_before_ner():
+    """ Test that an NER works after an entity_ruler: the second can add annotations """
+    nlp = English()
+
+    # 1 : Entity Ruler - should set "this" to B and everything else to empty
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # 2: untrained NER - should set everything else to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner)
+    nlp.begin_training()
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_ner_before_ruler():
+    """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """
+    nlp = English()
+
+    # 1: untrained NER - should set everything to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+
+    # 2 : Entity Ruler - should set "this" to B and keep everything else O
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_block_ner():
+    """ Test functionality for blocking tokens so they can't be in a named entity """
+    # block "Antti L Korhonen" from being a named entity
+    nlp = English()
+    nlp.add_pipe(BlockerComponent1(2, 5))
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+    doc = nlp("This is Antti L Korhonen speaking in Finland")
+    expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
+    expected_types = ["", "", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+class BlockerComponent1(object):
+    name = "my_blocker"
+
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __call__(self, doc):
+        doc.ents = [(0, self.start, self.end)]
+        return doc
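
The new tests block spans with a toy component that hard-codes token offsets. In practice the blocked spans would more likely come from a matcher or a gazetteer; a hedged sketch under that assumption follows. BlockMatchedSpans and its pattern are made up for illustration, and as in test_block_ner the component must sit before the ner in the pipeline for the blocks to constrain it.

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    class BlockMatchedSpans(object):
        """Hypothetical pipeline component: block every Matcher hit from NER."""
        name = "block_matched_spans"

        def __init__(self, nlp):
            self.matcher = Matcher(nlp.vocab)
            # Example rule: block "Firstname X Lastname"-style spans (illustrative only)
            self.matcher.add("BLOCK", None, [{"IS_TITLE": True}, {"IS_UPPER": True}, {"IS_TITLE": True}])

        def __call__(self, doc):
            # Entity type 0 marks a span as blocked rather than labelled
            doc.ents = list(doc.ents) + [(0, start, end) for _, start, end in self.matcher(doc)]
            return doc

    nlp = English()
    nlp.add_pipe(BlockMatchedSpans(nlp), first=True)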

View File

@@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
 def test_issue999(train_data):
     """Test that adding entities and resuming training works passably OK.
     There are two issues here:
-    1) We have to readd labels. This isn't very nice.
+    1) We have to read labels. This isn't very nice.
     2) There's no way to set the learning rate for the weight update, so we
     end up out-of-scale, causing it to learn too fast.
     """

View File

@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+import spacy
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+from spacy.tokens import Span
+
+
+def test_issue4267():
+    """ Test that running an entity_ruler after ner gives consistent results"""
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    ner.add_label("PEOPLE")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    assert "ner" in nlp.pipe_names
+
+    # assert that we have correct IOB annotations
+    doc1 = nlp("hi")
+    assert doc1.is_nered
+    for token in doc1:
+        assert token.ent_iob == 2
+
+    # add entity ruler and run again
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    assert "entity_ruler" in nlp.pipe_names
+    assert "ner" in nlp.pipe_names
+
+    # assert that we still have correct IOB annotations
+    doc2 = nlp("hi")
+    assert doc2.is_nered
+    for token in doc2:
+        assert token.ent_iob == 2

View File

@@ -256,7 +256,7 @@ cdef class Doc:
     def is_nered(self):
         """Check if the document has named entities set. Will return True if
         *any* of the tokens has a named entity tag set (even if the others are
-        uknown values).
+        unknown values).
         """
         if len(self) == 0:
             return True
@@ -525,13 +525,11 @@ cdef class Doc:
         def __set__(self, ents):
             # TODO:
-            # 1. Allow negative matches
-            # 2. Ensure pre-set NERs are not over-written during statistical
-            #    prediction
-            # 3. Test basic data-driven ORTH gazetteer
-            # 4. Test more nuanced date and currency regex
+            # 1. Test basic data-driven ORTH gazetteer
+            # 2. Test more nuanced date and currency regex
             tokens_in_ents = {}
             cdef attr_t entity_type
+            cdef attr_t kb_id
             cdef int ent_start, ent_end
             for ent_info in ents:
                 entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
@@ -545,27 +543,31 @@ cdef class Doc:
                     tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
             cdef int i
             for i in range(self.length):
-                self.c[i].ent_type = 0
-                self.c[i].ent_kb_id = 0
-                self.c[i].ent_iob = 0  # Means missing.
-            cdef attr_t ent_type
-            cdef int start, end
-            for ent_info in ents:
-                ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
-                if ent_type is None or ent_type < 0:
-                    # Mark as O
-                    for i in range(start, end):
-                        self.c[i].ent_type = 0
-                        self.c[i].ent_kb_id = 0
-                        self.c[i].ent_iob = 2
-                else:
-                    # Mark (inside) as I
-                    for i in range(start, end):
-                        self.c[i].ent_type = ent_type
-                        self.c[i].ent_kb_id = ent_kb_id
-                        self.c[i].ent_iob = 1
-                    # Set start as B
-                    self.c[start].ent_iob = 3
+                # default values
+                entity_type = 0
+                kb_id = 0
+
+                # Set ent_iob to Missing (0) bij default unless this token was nered before
+                ent_iob = 0
+                if self.c[i].ent_iob != 0:
+                    ent_iob = 2
+
+                # overwrite if the token was part of a specified entity
+                if i in tokens_in_ents.keys():
+                    ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
+                    if entity_type is None or entity_type <= 0:
+                        # Blocking this token from being overwritten by downstream NER
+                        ent_iob = 3
+                    elif ent_start == i:
+                        # Marking the start of an entity
+                        ent_iob = 3
+                    else:
+                        # Marking the inside of an entity
+                        ent_iob = 1
+
+                self.c[i].ent_type = entity_type
+                self.c[i].ent_kb_id = kb_id
+                self.c[i].ent_iob = ent_iob

         @property
         def noun_chunks(self):
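
The practical upshot of the rewritten setter: "O" annotations assigned by an earlier component survive a re-assignment of doc.ents instead of being reset to missing. A small sketch mirroring test_ents_reset and test_overwrite_token above (illustrative, not part of the commit):

    from spacy.lang.en import English

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("MY_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # The untrained NER tags every token as outside an entity
    doc = nlp("This is a lion")
    assert [t.ent_iob_ for t in doc] == ["O", "O", "O", "O"]

    # Re-assigning doc.ents now keeps those "O" annotations instead of
    # resetting them to missing ("")
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == ["O", "O", "O", "O"]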

View File

@@ -749,7 +749,8 @@ cdef class Token:
     def ent_iob_(self):
         """IOB code of named entity tag. "B" means the token begins an entity,
         "I" means it is inside an entity, "O" means it is outside an entity,
-        and "" means no entity tag is set.
+        and "" means no entity tag is set. "B" with an empty ent_type
+        means that the token is blocked from further processing by NER.

         RETURNS (unicode): IOB code of named entity tag.
         """