Distinction between outside, missing and blocked NER annotations (#4307)

* remove duplicate unit test

* unit test (currently failing) for issue 4267

* bugfix: ensure doc.ents preserves kb_id annotations

* fix in setting doc.ents with empty label

* rename

* test for presetting an entity to a certain type

* allow overwriting Outside + blocking presets

* fix actions when previous label needs to be kept

* fix default ent_iob in set entities

* cleaner solution with U- action

* remove debugging print statements

* unit tests with explicit transitions and is_valid testing

* remove U- from move_names explicitly

* remove unit tests with pre-trained models that don't work

* remove (working) unit tests with pre-trained models

* clean up unit tests

* move unit tests

* small fixes

* remove two TODO's from doc.ents comments
This commit is contained in:
Sofie Van Landeghem 2019-09-18 21:37:17 +02:00 committed by Matthew Honnibal
parent 72463b062f
commit de5a9ecdf3
9 changed files with 273 additions and 62 deletions

View File

@ -118,7 +118,7 @@ class Errors(object):
E011 = ("Unknown operator: '{op}'. Options: {opts}")
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
E013 = ("Error selecting action in matcher")
E014 = ("Uknown tag ID: {tag}")
E014 = ("Unknown tag ID: {tag}")
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
"`force=True` to overwrite.")
E016 = ("MultitaskObjective target should be function or one of: dep, "

View File

@ -66,7 +66,8 @@ cdef class BiluoPushDown(TransitionSystem):
UNIT: Counter(),
OUT: Counter()
}
actions[OUT][''] = 1
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
for entity_type in kwargs.get('entity_types', []):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1
@ -161,8 +162,7 @@ cdef class BiluoPushDown(TransitionSystem):
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
else:
raise KeyError(Errors.E022.format(name=name))
raise KeyError(Errors.E022.format(name=name))
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
@ -266,7 +266,7 @@ cdef class Begin:
return False
elif label == 0:
return False
elif preset_ent_iob == 1 or preset_ent_iob == 2:
elif preset_ent_iob == 1:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
return False
@ -282,8 +282,8 @@ cdef class Begin:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
# If the next word is B or O, we can't B now
elif st.B_(1).ent_iob == 3:
# If the next word is B, we can't B now
return False
elif st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
@ -326,6 +326,7 @@ cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif st.E_(0).ent_type != label:
@ -335,13 +336,22 @@ cdef class In:
elif st.B(1) == -1:
# If we're at the end, we can't I.
return False
elif preset_ent_iob == 2:
return False
elif preset_ent_iob == 3:
return False
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
# If we know the next word is B or O, we can't be I (must be L)
elif st.B_(1).ent_iob == 3:
# If we know the next word is B, we can't be I (must be L)
return False
elif preset_ent_iob == 1:
if st.B_(1).ent_iob in (0, 2):
# if next preset is missing or O, this can't be I (must be L)
return False
elif label != preset_ent_label:
# If label isn't right, reject
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
return False
@ -387,17 +397,24 @@ cdef class In:
else:
return 1
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif not st.entity_is_open():
return False
elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
# If a preset entity has I followed by not-I, is L
return True
if label != preset_ent_label:
# If label isn't right, reject
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.E_(0).ent_type != label:
return False
elif st.B_(1).ent_iob == 1:
@ -450,12 +467,13 @@ cdef class Unit:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
# this is only allowed if it's a preset blocked annotation
if preset_ent_label == 0 and preset_ent_iob == 3:
return True
else:
return False
elif st.entity_is_open():
return False
elif preset_ent_iob == 2:
# Don't clobber preset O
return False
elif st.B_(1).ent_iob == 1:
# If next token is In, we can't be Unit -- must be Begin
return False

View File

@ -135,7 +135,9 @@ cdef class Parser:
names = []
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
names.append(name)
# Explicitly removing the internal "U-" token used for blocking entities
if name != "U-":
names.append(name)
return names
nr_feature = 8

View File

@ -16,10 +16,23 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
def test_ents_reset(en_vocab):
text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text)
ner = EntityRecognizer(en_vocab)
ner.begin_training([])
ner(doc)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
doc.ents = list(doc.ents)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
def test_add_overlapping_entities(en_vocab):

View File

@ -2,7 +2,9 @@
from __future__ import unicode_literals
import pytest
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.vocab import Vocab
from spacy.syntax.ner import BiluoPushDown
from spacy.gold import GoldParse
@ -80,14 +82,145 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
assert names
def test_doc_add_entities_set_ents_iob(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
ner = EntityRecognizer(en_vocab)
ner.begin_training([])
ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
def test_accept_blocked_token():
"""Test succesful blocking of tokens to be in an entity."""
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
ner1 = EntityRecognizer(doc1.vocab)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
# Add the OUT action
ner1.moves.add_action(5, "")
ner1.add_label("GPE")
# Get into the state just before "New"
state1 = ner1.moves.init_batch([doc1])[0]
ner1.moves.apply_transition(state1, "O")
ner1.moves.apply_transition(state1, "O")
ner1.moves.apply_transition(state1, "O")
# Check that B-GPE is valid.
assert ner1.moves.is_valid(state1, "B-GPE")
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
ner2 = EntityRecognizer(doc2.vocab)
# set "New York" to a blocked entity
doc2.ents = [(0, 3, 5)]
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
# Check that B-GPE is now invalid.
ner2.moves.add_action(4, "")
ner2.moves.add_action(5, "")
ner2.add_label("GPE")
state2 = ner2.moves.init_batch([doc2])[0]
ner2.moves.apply_transition(state2, "O")
ner2.moves.apply_transition(state2, "O")
ner2.moves.apply_transition(state2, "O")
# we can only use U- for "New"
assert not ner2.moves.is_valid(state2, "B-GPE")
assert ner2.moves.is_valid(state2, "U-")
ner2.moves.apply_transition(state2, "U-")
# we can only use U- for "York"
assert not ner2.moves.is_valid(state2, "B-GPE")
assert ner2.moves.is_valid(state2, "U-")
def test_overwrite_token():
nlp = English()
ner1 = nlp.create_pipe("ner")
nlp.add_pipe(ner1, name="ner")
nlp.begin_training()
# The untrained NER will predict O for each token
doc = nlp("I live in New York")
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
ner2 = EntityRecognizer(doc.vocab)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")
state = ner2.moves.init_batch([doc])[0]
assert ner2.moves.is_valid(state, "B-GPE")
assert ner2.moves.is_valid(state, "U-GPE")
ner2.moves.apply_transition(state, "B-GPE")
assert ner2.moves.is_valid(state, "I-GPE")
assert ner2.moves.is_valid(state, "L-GPE")
def test_ruler_before_ner():
""" Test that an NER works after an entity_ruler: the second can add annotations """
nlp = English()
# 1 : Entity Ruler - should set "this" to B and everything else to empty
ruler = EntityRuler(nlp)
patterns = [{"label": "THING", "pattern": "This"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
# 2: untrained NER - should set everything else to O
untrained_ner = nlp.create_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.add_pipe(untrained_ner)
nlp.begin_training()
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]
assert [token.ent_iob_ for token in doc] == expected_iobs
assert [token.ent_type_ for token in doc] == expected_types
def test_ner_before_ruler():
""" Test that an entity_ruler works after an NER: the second can overwrite O annotations """
nlp = English()
# 1: untrained NER - should set everything to O
untrained_ner = nlp.create_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.add_pipe(untrained_ner, name="uner")
nlp.begin_training()
# 2 : Entity Ruler - should set "this" to B and keep everything else O
ruler = EntityRuler(nlp)
patterns = [{"label": "THING", "pattern": "This"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]
assert [token.ent_iob_ for token in doc] == expected_iobs
assert [token.ent_type_ for token in doc] == expected_types
def test_block_ner():
""" Test functionality for blocking tokens so they can't be in a named entity """
# block "Antti L Korhonen" from being a named entity
nlp = English()
nlp.add_pipe(BlockerComponent1(2, 5))
untrained_ner = nlp.create_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.add_pipe(untrained_ner, name="uner")
nlp.begin_training()
doc = nlp("This is Antti L Korhonen speaking in Finland")
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
expected_types = ["", "", "", "", "", "", "", ""]
assert [token.ent_iob_ for token in doc] == expected_iobs
assert [token.ent_type_ for token in doc] == expected_types
class BlockerComponent1(object):
name = "my_blocker"
def __init__(self, start, end):
self.start = start
self.end = end
def __call__(self, doc):
doc.ents = [(0, self.start, self.end)]
return doc

View File

@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
def test_issue999(train_data):
"""Test that adding entities and resuming training works passably OK.
There are two issues here:
1) We have to readd labels. This isn't very nice.
1) We have to read labels. This isn't very nice.
2) There's no way to set the learning rate for the weight update, so we
end up out-of-scale, causing it to learn too fast.
"""

View File

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2

View File

@ -256,7 +256,7 @@ cdef class Doc:
def is_nered(self):
"""Check if the document has named entities set. Will return True if
*any* of the tokens has a named entity tag set (even if the others are
uknown values).
unknown values).
"""
if len(self) == 0:
return True
@ -525,13 +525,11 @@ cdef class Doc:
def __set__(self, ents):
# TODO:
# 1. Allow negative matches
# 2. Ensure pre-set NERs are not over-written during statistical
# prediction
# 3. Test basic data-driven ORTH gazetteer
# 4. Test more nuanced date and currency regex
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
tokens_in_ents = {}
cdef attr_t entity_type
cdef attr_t kb_id
cdef int ent_start, ent_end
for ent_info in ents:
entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
@ -545,27 +543,31 @@ cdef class Doc:
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
cdef int i
for i in range(self.length):
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_iob = 0 # Means missing.
cdef attr_t ent_type
cdef int start, end
for ent_info in ents:
ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
if ent_type is None or ent_type < 0:
# Mark as O
for i in range(start, end):
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_iob = 2
else:
# Mark (inside) as I
for i in range(start, end):
self.c[i].ent_type = ent_type
self.c[i].ent_kb_id = ent_kb_id
self.c[i].ent_iob = 1
# Set start as B
self.c[start].ent_iob = 3
# default values
entity_type = 0
kb_id = 0
# Set ent_iob to Missing (0) bij default unless this token was nered before
ent_iob = 0
if self.c[i].ent_iob != 0:
ent_iob = 2
# overwrite if the token was part of a specified entity
if i in tokens_in_ents.keys():
ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
if entity_type is None or entity_type <= 0:
# Blocking this token from being overwritten by downstream NER
ent_iob = 3
elif ent_start == i:
# Marking the start of an entity
ent_iob = 3
else:
# Marking the inside of an entity
ent_iob = 1
self.c[i].ent_type = entity_type
self.c[i].ent_kb_id = kb_id
self.c[i].ent_iob = ent_iob
@property
def noun_chunks(self):

View File

@ -749,7 +749,8 @@ cdef class Token:
def ent_iob_(self):
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity,
and "" means no entity tag is set.
and "" means no entity tag is set. "B" with an empty ent_type
means that the token is blocked from further processing by NER.
RETURNS (unicode): IOB code of named entity tag.
"""