From de5a9ecdf3e60240c78526d04aaa0931e3e825a6 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 18 Sep 2019 21:37:17 +0200
Subject: [PATCH] Distinction between outside, missing and blocked NER annotations (#4307)

* remove duplicate unit test

* unit test (currently failing) for issue 4267

* bugfix: ensure doc.ents preserves kb_id annotations

* fix in setting doc.ents with empty label

* rename

* test for presetting an entity to a certain type

* allow overwriting Outside + blocking presets

* fix actions when previous label needs to be kept

* fix default ent_iob in set entities

* cleaner solution with U- action

* remove debugging print statements

* unit tests with explicit transitions and is_valid testing

* remove U- from move_names explicitly

* remove unit tests with pre-trained models that don't work

* remove (working) unit tests with pre-trained models

* clean up unit tests

* move unit tests

* small fixes

* remove two TODO's from doc.ents comments
---
 spacy/errors.py                            |   2 +-
 spacy/syntax/ner.pyx                       |  52 ++++---
 spacy/syntax/nn_parser.pyx                 |   4 +-
 spacy/tests/doc/test_add_entities.py       |  17 ++-
 spacy/tests/parser/test_ner.py             | 157 +++++++++++++++++++--
 spacy/tests/regression/test_issue1-1000.py |   2 +-
 spacy/tests/regression/test_issue4267.py   |  42 ++++++
 spacy/tokens/doc.pyx                       |  56 ++++----
 spacy/tokens/token.pyx                     |   3 +-
 9 files changed, 273 insertions(+), 62 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue4267.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 587a6e700..3b96179d7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -118,7 +118,7 @@ class Errors(object):
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
     E013 = ("Error selecting action in matcher")
-    E014 = ("Uknown tag ID: {tag}")
+    E014 = ("Unknown tag ID: {tag}")
     E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
             "`force=True` to overwrite.")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 767e4c2e0..9f8ad418c 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -66,7 +66,8 @@ cdef class BiluoPushDown(TransitionSystem):
             UNIT: Counter(),
             OUT: Counter()
         }
-        actions[OUT][''] = 1
+        actions[OUT][''] = 1  # Represents a token predicted to be outside of any entity
+        actions[UNIT][''] = 1  # Represents a token prohibited to be in an entity
         for entity_type in kwargs.get('entity_types', []):
             for action in (BEGIN, IN, LAST, UNIT):
                 actions[action][entity_type] = 1
@@ -161,8 +162,7 @@ cdef class BiluoPushDown(TransitionSystem):
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]
-        else:
-            raise KeyError(Errors.E022.format(name=name))
+        raise KeyError(Errors.E022.format(name=name))
 
     cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
         # TODO: Apparent Cython bug here when we try to use the Transition()
@@ -266,7 +266,7 @@ cdef class Begin:
             return False
         elif label == 0:
             return False
-        elif preset_ent_iob == 1 or preset_ent_iob == 2:
+        elif preset_ent_iob == 1:
             # Ensure we don't clobber preset entities. If no entity preset,
             # ent_iob is 0
             return False
@@ -282,8 +282,8 @@ cdef class Begin:
                 # Otherwise, force acceptance, even if we're across a sentence
                 # boundary or the token is whitespace.
                return True
-        elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
-            # If the next word is B or O, we can't B now
+        elif st.B_(1).ent_iob == 3:
+            # If the next word is B, we can't B now
            return False
        elif st.B_(1).sent_start == 1:
            # Don't allow entities to extend across sentence boundaries
@@ -326,6 +326,7 @@ cdef class In:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
         cdef int preset_ent_iob = st.B_(0).ent_iob
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
             return False
         elif st.E_(0).ent_type != label:
@@ -335,13 +336,22 @@ cdef class In:
         elif st.B(1) == -1:
             # If we're at the end, we can't I.
             return False
-        elif preset_ent_iob == 2:
-            return False
         elif preset_ent_iob == 3:
             return False
-        elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
-            # If we know the next word is B or O, we can't be I (must be L)
+        elif st.B_(1).ent_iob == 3:
+            # If we know the next word is B, we can't be I (must be L)
             return False
+        elif preset_ent_iob == 1:
+            if st.B_(1).ent_iob in (0, 2):
+                # if next preset is missing or O, this can't be I (must be L)
+                return False
+            elif label != preset_ent_label:
+                # If label isn't right, reject
+                return False
+            else:
+                # Otherwise, force acceptance, even if we're across a sentence
+                # boundary or the token is whitespace.
+                return True
         elif st.B(1) != -1 and st.B_(1).sent_start == 1:
             # Don't allow entities to extend across sentence boundaries
             return False
@@ -387,17 +397,24 @@ cdef class In:
         else:
             return 1
 
-
 cdef class Last:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
+        cdef int preset_ent_iob = st.B_(0).ent_iob
+        cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
             return False
         elif not st.entity_is_open():
             return False
-        elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
+        elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
             # If a preset entity has I followed by not-I, is L
-            return True
+            if label != preset_ent_label:
+                # If label isn't right, reject
+                return False
+            else:
+                # Otherwise, force acceptance, even if we're across a sentence
+                # boundary or the token is whitespace.
+                return True
         elif st.E_(0).ent_type != label:
             return False
         elif st.B_(1).ent_iob == 1:
@@ -450,12 +467,13 @@ cdef class Unit:
         cdef int preset_ent_iob = st.B_(0).ent_iob
         cdef attr_t preset_ent_label = st.B_(0).ent_type
         if label == 0:
-            return False
+            # this is only allowed if it's a preset blocked annotation
+            if preset_ent_label == 0 and preset_ent_iob == 3:
+                return True
+            else:
+                return False
         elif st.entity_is_open():
             return False
-        elif preset_ent_iob == 2:
-            # Don't clobber preset O
-            return False
         elif st.B_(1).ent_iob == 1:
             # If next token is In, we can't be Unit -- must be Begin
             return False
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 85f7b5bb9..18c45fdfc 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -135,7 +135,9 @@ cdef class Parser:
         names = []
         for i in range(self.moves.n_moves):
             name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
-            names.append(name)
+            # Explicitly removing the internal "U-" token used for blocking entities
+            if name != "U-":
+                names.append(name)
         return names
 
     nr_feature = 8
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 433541c48..374c3ddd8 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -16,10 +16,23 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     ner(doc)
     assert len(list(doc.ents)) == 0
     assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
+
     doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
+    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
+
     doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
+
+
+def test_ents_reset(en_vocab):
+    text = ["This", "is", "a", "lion"]
+    doc = get_doc(en_vocab, text)
+    ner = EntityRecognizer(en_vocab)
+    ner.begin_training([])
+    ner(doc)
+    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
+    doc.ents = list(doc.ents)
+    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
 
 
 def test_add_overlapping_entities(en_vocab):
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 43c00a963..65586bda1 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -2,7 +2,9 @@
 from __future__ import unicode_literals
 
 import pytest
-from spacy.pipeline import EntityRecognizer
+from spacy.lang.en import English
+
+from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
 from spacy.gold import GoldParse
@@ -80,14 +82,145 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
     assert names
 
 
-def test_doc_add_entities_set_ents_iob(en_vocab):
-    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
-    ner = EntityRecognizer(en_vocab)
-    ner.begin_training([])
-    ner(doc)
-    assert len(list(doc.ents)) == 0
-    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
-    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
-    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+def test_accept_blocked_token():
+    """Test successful blocking of tokens from being in an entity."""
+    # 1. test normal behaviour
+    nlp1 = English()
+    doc1 = nlp1("I live in New York")
+    ner1 = EntityRecognizer(doc1.vocab)
+    assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
+    assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
+
+    # Add the OUT action
+    ner1.moves.add_action(5, "")
+    ner1.add_label("GPE")
+    # Get into the state just before "New"
+    state1 = ner1.moves.init_batch([doc1])[0]
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    # Check that B-GPE is valid.
+    assert ner1.moves.is_valid(state1, "B-GPE")
+
+    # 2. test blocking behaviour
+    nlp2 = English()
+    doc2 = nlp2("I live in New York")
+    ner2 = EntityRecognizer(doc2.vocab)
+
+    # set "New York" to a blocked entity
+    doc2.ents = [(0, 3, 5)]
+    assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
+    assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
+
+    # Check that B-GPE is now invalid.
+    ner2.moves.add_action(4, "")
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state2 = ner2.moves.init_batch([doc2])[0]
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    # we can only use U- for "New"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+    ner2.moves.apply_transition(state2, "U-")
+    # we can only use U- for "York"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+
+
+def test_overwrite_token():
+    nlp = English()
+    ner1 = nlp.create_pipe("ner")
+    nlp.add_pipe(ner1, name="ner")
+    nlp.begin_training()
+
+    # The untrained NER will predict O for each token
+    doc = nlp("I live in New York")
+    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
+    assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
+
+    # Check that a new ner can overwrite O
+    ner2 = EntityRecognizer(doc.vocab)
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state = ner2.moves.init_batch([doc])[0]
+    assert ner2.moves.is_valid(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "U-GPE")
+    ner2.moves.apply_transition(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "I-GPE")
+    assert ner2.moves.is_valid(state, "L-GPE")
+
+
+def test_ruler_before_ner():
+    """ Test that an NER works after an entity_ruler: the second can add annotations """
+    nlp = English()
+
+    # 1 : Entity Ruler - should set "this" to B and everything else to empty
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # 2: untrained NER - should set everything else to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner)
+    nlp.begin_training()
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_ner_before_ruler():
+    """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """
+    nlp = English()
+
+    # 1: untrained NER - should set everything to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+
+    # 2 : Entity Ruler - should set "this" to B and keep everything else O
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_block_ner():
+    """ Test functionality for blocking tokens so they can't be in a named entity """
+    # block "Antti L Korhonen" from being a named entity
+    nlp = English()
+    nlp.add_pipe(BlockerComponent1(2, 5))
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+    doc = nlp("This is Antti L Korhonen speaking in Finland")
+    expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
+    expected_types = ["", "", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+class BlockerComponent1(object):
+    name = "my_blocker"
+
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __call__(self, doc):
+        doc.ents = [(0, self.start, self.end)]
+        return doc
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index febf2b5b3..b3f347765 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
 def test_issue999(train_data):
     """Test that adding entities and resuming training works passably OK.
     There are two issues here:
-    1) We have to readd labels. This isn't very nice.
+    1) We have to re-add labels. This isn't very nice.
     2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
     """
diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py
new file mode 100644
index 000000000..5fc61e142
--- /dev/null
+++ b/spacy/tests/regression/test_issue4267.py
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+import spacy
+
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+from spacy.tokens import Span
+
+
+def test_issue4267():
+    """ Test that running an entity_ruler after ner gives consistent results"""
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    ner.add_label("PEOPLE")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    assert "ner" in nlp.pipe_names
+
+    # assert that we have correct IOB annotations
+    doc1 = nlp("hi")
+    assert doc1.is_nered
+    for token in doc1:
+        assert token.ent_iob == 2
+
+    # add entity ruler and run again
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
+
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+    assert "entity_ruler" in nlp.pipe_names
+    assert "ner" in nlp.pipe_names
+
+    # assert that we still have correct IOB annotations
+    doc2 = nlp("hi")
+    assert doc2.is_nered
+    for token in doc2:
+        assert token.ent_iob == 2
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index e863b0807..9cf8e7fa5 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -256,7 +256,7 @@ cdef class Doc:
     def is_nered(self):
         """Check if the document has named entities set. Will return True if
         *any* of the tokens has a named entity tag set (even if the others are
-        uknown values).
+        unknown values).
         """
         if len(self) == 0:
             return True
@@ -525,13 +525,11 @@ cdef class Doc:
 
         def __set__(self, ents):
             # TODO:
-            # 1. Allow negative matches
-            # 2. Ensure pre-set NERs are not over-written during statistical
-            #    prediction
-            # 3. Test basic data-driven ORTH gazetteer
-            # 4. Test more nuanced date and currency regex
+            # 1. Test basic data-driven ORTH gazetteer
+            # 2. Test more nuanced date and currency regex
             tokens_in_ents = {}
             cdef attr_t entity_type
+            cdef attr_t kb_id
             cdef int ent_start, ent_end
             for ent_info in ents:
                 entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
@@ -545,27 +543,31 @@ cdef class Doc:
                     tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
             cdef int i
             for i in range(self.length):
-                self.c[i].ent_type = 0
-                self.c[i].ent_kb_id = 0
-                self.c[i].ent_iob = 0  # Means missing.
-            cdef attr_t ent_type
-            cdef int start, end
-            for ent_info in ents:
-                ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
-                if ent_type is None or ent_type < 0:
-                    # Mark as O
-                    for i in range(start, end):
-                        self.c[i].ent_type = 0
-                        self.c[i].ent_kb_id = 0
-                        self.c[i].ent_iob = 2
-                else:
-                    # Mark (inside) as I
-                    for i in range(start, end):
-                        self.c[i].ent_type = ent_type
-                        self.c[i].ent_kb_id = ent_kb_id
-                        self.c[i].ent_iob = 1
-                    # Set start as B
-                    self.c[start].ent_iob = 3
+                # default values
+                entity_type = 0
+                kb_id = 0
+
+                # Set ent_iob to Missing (0) by default, unless this token was nered before
+                ent_iob = 0
+                if self.c[i].ent_iob != 0:
+                    ent_iob = 2
+
+                # overwrite if the token was part of a specified entity
+                if i in tokens_in_ents.keys():
+                    ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
+                    if entity_type is None or entity_type <= 0:
+                        # Blocking this token from being overwritten by downstream NER
+                        ent_iob = 3
+                    elif ent_start == i:
+                        # Marking the start of an entity
+                        ent_iob = 3
+                    else:
+                        # Marking the inside of an entity
+                        ent_iob = 1
+
+                self.c[i].ent_type = entity_type
+                self.c[i].ent_kb_id = kb_id
+                self.c[i].ent_iob = ent_iob
 
     @property
     def noun_chunks(self):
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 07c6f1c99..69b9def38 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -749,7 +749,8 @@ cdef class Token:
     def ent_iob_(self):
         """IOB code of named entity tag. "B" means the token begins an entity,
         "I" means it is inside an entity, "O" means it is outside an entity,
-        and "" means no entity tag is set.
+        and "" means no entity tag is set. "B" with an empty ent_type
+        means that the token is blocked from further processing by NER.
 
         RETURNS (unicode): IOB code of named entity tag.
         """
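
Supplementary usage sketch (not part of the diff above; distilled from test_accept_blocked_token and test_block_ner, assuming a spaCy build with this patch applied). Assigning a span to doc.ents with entity label 0 marks its tokens as blocked: ent_iob_ becomes "B" with an empty ent_type_, kept distinct from "O" (outside) and "" (missing), and a subsequent EntityRecognizer can then only apply the internal "U-" move on those tokens:

    from spacy.lang.en import English
    from spacy.pipeline import EntityRecognizer

    nlp = English()
    doc = nlp("I live in New York")

    # Entity label 0 over tokens 3-5 blocks "New York": ent_iob_ becomes "B"
    # while the entity type stays empty, unlike O (outside) or "" (missing).
    doc.ents = [(0, 3, 5)]
    assert [t.ent_iob_ for t in doc] == ["", "", "", "B", "B"]
    assert [t.ent_type_ for t in doc] == ["", "", "", "", ""]

    # A fresh NER has to respect the preset: after three O moves, only the
    # internal "U-" action is valid on "New"; B-GPE is rejected.
    ner = EntityRecognizer(doc.vocab)
    ner.moves.add_action(4, "")  # U- action used for blocked tokens
    ner.moves.add_action(5, "")  # O action
    ner.add_label("GPE")
    state = ner.moves.init_batch([doc])[0]
    for _ in range(3):
        ner.moves.apply_transition(state, "O")
    assert not ner.moves.is_valid(state, "B-GPE")
    assert ner.moves.is_valid(state, "U-")

The same mechanism is what BlockerComponent1 relies on in test_block_ner: a pipeline component that presets doc.ents with label 0 before the NER component runs.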