mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Distinction between outside, missing and blocked NER annotations (#4307)
* remove duplicate unit test * unit test (currently failing) for issue 4267 * bugfix: ensure doc.ents preserves kb_id annotations * fix in setting doc.ents with empty label * rename * test for presetting an entity to a certain type * allow overwriting Outside + blocking presets * fix actions when previous label needs to be kept * fix default ent_iob in set entities * cleaner solution with U- action * remove debugging print statements * unit tests with explicit transitions and is_valid testing * remove U- from move_names explicitly * remove unit tests with pre-trained models that don't work * remove (working) unit tests with pre-trained models * clean up unit tests * move unit tests * small fixes * remove two TODO's from doc.ents comments
This commit is contained in:
parent
72463b062f
commit
de5a9ecdf3
|
@ -118,7 +118,7 @@ class Errors(object):
|
|||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||
E013 = ("Error selecting action in matcher")
|
||||
E014 = ("Uknown tag ID: {tag}")
|
||||
E014 = ("Unknown tag ID: {tag}")
|
||||
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
|
||||
"`force=True` to overwrite.")
|
||||
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
||||
|
|
|
@ -66,7 +66,8 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
UNIT: Counter(),
|
||||
OUT: Counter()
|
||||
}
|
||||
actions[OUT][''] = 1
|
||||
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
|
||||
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
|
||||
for entity_type in kwargs.get('entity_types', []):
|
||||
for action in (BEGIN, IN, LAST, UNIT):
|
||||
actions[action][entity_type] = 1
|
||||
|
@ -161,7 +162,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
for i in range(self.n_moves):
|
||||
if self.c[i].move == move and self.c[i].label == label:
|
||||
return self.c[i]
|
||||
else:
|
||||
raise KeyError(Errors.E022.format(name=name))
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
|
@ -266,7 +266,7 @@ cdef class Begin:
|
|||
return False
|
||||
elif label == 0:
|
||||
return False
|
||||
elif preset_ent_iob == 1 or preset_ent_iob == 2:
|
||||
elif preset_ent_iob == 1:
|
||||
# Ensure we don't clobber preset entities. If no entity preset,
|
||||
# ent_iob is 0
|
||||
return False
|
||||
|
@ -282,8 +282,8 @@ cdef class Begin:
|
|||
# Otherwise, force acceptance, even if we're across a sentence
|
||||
# boundary or the token is whitespace.
|
||||
return True
|
||||
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
|
||||
# If the next word is B or O, we can't B now
|
||||
elif st.B_(1).ent_iob == 3:
|
||||
# If the next word is B, we can't B now
|
||||
return False
|
||||
elif st.B_(1).sent_start == 1:
|
||||
# Don't allow entities to extend across sentence boundaries
|
||||
|
@ -326,6 +326,7 @@ cdef class In:
|
|||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||
if label == 0:
|
||||
return False
|
||||
elif st.E_(0).ent_type != label:
|
||||
|
@ -335,13 +336,22 @@ cdef class In:
|
|||
elif st.B(1) == -1:
|
||||
# If we're at the end, we can't I.
|
||||
return False
|
||||
elif preset_ent_iob == 2:
|
||||
return False
|
||||
elif preset_ent_iob == 3:
|
||||
return False
|
||||
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
|
||||
# If we know the next word is B or O, we can't be I (must be L)
|
||||
elif st.B_(1).ent_iob == 3:
|
||||
# If we know the next word is B, we can't be I (must be L)
|
||||
return False
|
||||
elif preset_ent_iob == 1:
|
||||
if st.B_(1).ent_iob in (0, 2):
|
||||
# if next preset is missing or O, this can't be I (must be L)
|
||||
return False
|
||||
elif label != preset_ent_label:
|
||||
# If label isn't right, reject
|
||||
return False
|
||||
else:
|
||||
# Otherwise, force acceptance, even if we're across a sentence
|
||||
# boundary or the token is whitespace.
|
||||
return True
|
||||
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
||||
# Don't allow entities to extend across sentence boundaries
|
||||
return False
|
||||
|
@ -387,16 +397,23 @@ cdef class In:
|
|||
else:
|
||||
return 1
|
||||
|
||||
|
||||
cdef class Last:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||
if label == 0:
|
||||
return False
|
||||
elif not st.entity_is_open():
|
||||
return False
|
||||
elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
|
||||
elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
|
||||
# If a preset entity has I followed by not-I, is L
|
||||
if label != preset_ent_label:
|
||||
# If label isn't right, reject
|
||||
return False
|
||||
else:
|
||||
# Otherwise, force acceptance, even if we're across a sentence
|
||||
# boundary or the token is whitespace.
|
||||
return True
|
||||
elif st.E_(0).ent_type != label:
|
||||
return False
|
||||
|
@ -450,12 +467,13 @@ cdef class Unit:
|
|||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||
if label == 0:
|
||||
# this is only allowed if it's a preset blocked annotation
|
||||
if preset_ent_label == 0 and preset_ent_iob == 3:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
elif st.entity_is_open():
|
||||
return False
|
||||
elif preset_ent_iob == 2:
|
||||
# Don't clobber preset O
|
||||
return False
|
||||
elif st.B_(1).ent_iob == 1:
|
||||
# If next token is In, we can't be Unit -- must be Begin
|
||||
return False
|
||||
|
|
|
@ -135,6 +135,8 @@ cdef class Parser:
|
|||
names = []
|
||||
for i in range(self.moves.n_moves):
|
||||
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
||||
# Explicitly removing the internal "U-" token used for blocking entities
|
||||
if name != "U-":
|
||||
names.append(name)
|
||||
return names
|
||||
|
||||
|
|
|
@ -16,10 +16,23 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
ner(doc)
|
||||
assert len(list(doc.ents)) == 0
|
||||
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
||||
|
||||
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
||||
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
|
||||
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
|
||||
|
||||
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
||||
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
|
||||
assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
|
||||
|
||||
|
||||
def test_ents_reset(en_vocab):
|
||||
text = ["This", "is", "a", "lion"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
ner = EntityRecognizer(en_vocab)
|
||||
ner.begin_training([])
|
||||
ner(doc)
|
||||
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||
doc.ents = list(doc.ents)
|
||||
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||
|
||||
|
||||
def test_add_overlapping_entities(en_vocab):
|
||||
|
|
|
@ -2,7 +2,9 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.lang.en import English
|
||||
|
||||
from spacy.pipeline import EntityRecognizer, EntityRuler
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.syntax.ner import BiluoPushDown
|
||||
from spacy.gold import GoldParse
|
||||
|
@ -80,14 +82,145 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
|
|||
assert names
|
||||
|
||||
|
||||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||
doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
|
||||
ner = EntityRecognizer(en_vocab)
|
||||
ner.begin_training([])
|
||||
ner(doc)
|
||||
assert len(list(doc.ents)) == 0
|
||||
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
||||
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
||||
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
|
||||
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
||||
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
|
||||
def test_accept_blocked_token():
|
||||
"""Test succesful blocking of tokens to be in an entity."""
|
||||
# 1. test normal behaviour
|
||||
nlp1 = English()
|
||||
doc1 = nlp1("I live in New York")
|
||||
ner1 = EntityRecognizer(doc1.vocab)
|
||||
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
|
||||
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
|
||||
|
||||
# Add the OUT action
|
||||
ner1.moves.add_action(5, "")
|
||||
ner1.add_label("GPE")
|
||||
# Get into the state just before "New"
|
||||
state1 = ner1.moves.init_batch([doc1])[0]
|
||||
ner1.moves.apply_transition(state1, "O")
|
||||
ner1.moves.apply_transition(state1, "O")
|
||||
ner1.moves.apply_transition(state1, "O")
|
||||
# Check that B-GPE is valid.
|
||||
assert ner1.moves.is_valid(state1, "B-GPE")
|
||||
|
||||
# 2. test blocking behaviour
|
||||
nlp2 = English()
|
||||
doc2 = nlp2("I live in New York")
|
||||
ner2 = EntityRecognizer(doc2.vocab)
|
||||
|
||||
# set "New York" to a blocked entity
|
||||
doc2.ents = [(0, 3, 5)]
|
||||
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
|
||||
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
|
||||
|
||||
# Check that B-GPE is now invalid.
|
||||
ner2.moves.add_action(4, "")
|
||||
ner2.moves.add_action(5, "")
|
||||
ner2.add_label("GPE")
|
||||
state2 = ner2.moves.init_batch([doc2])[0]
|
||||
ner2.moves.apply_transition(state2, "O")
|
||||
ner2.moves.apply_transition(state2, "O")
|
||||
ner2.moves.apply_transition(state2, "O")
|
||||
# we can only use U- for "New"
|
||||
assert not ner2.moves.is_valid(state2, "B-GPE")
|
||||
assert ner2.moves.is_valid(state2, "U-")
|
||||
ner2.moves.apply_transition(state2, "U-")
|
||||
# we can only use U- for "York"
|
||||
assert not ner2.moves.is_valid(state2, "B-GPE")
|
||||
assert ner2.moves.is_valid(state2, "U-")
|
||||
|
||||
|
||||
def test_overwrite_token():
|
||||
nlp = English()
|
||||
ner1 = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner1, name="ner")
|
||||
nlp.begin_training()
|
||||
|
||||
# The untrained NER will predict O for each token
|
||||
doc = nlp("I live in New York")
|
||||
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
|
||||
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
|
||||
|
||||
# Check that a new ner can overwrite O
|
||||
ner2 = EntityRecognizer(doc.vocab)
|
||||
ner2.moves.add_action(5, "")
|
||||
ner2.add_label("GPE")
|
||||
state = ner2.moves.init_batch([doc])[0]
|
||||
assert ner2.moves.is_valid(state, "B-GPE")
|
||||
assert ner2.moves.is_valid(state, "U-GPE")
|
||||
ner2.moves.apply_transition(state, "B-GPE")
|
||||
assert ner2.moves.is_valid(state, "I-GPE")
|
||||
assert ner2.moves.is_valid(state, "L-GPE")
|
||||
|
||||
|
||||
def test_ruler_before_ner():
|
||||
""" Test that an NER works after an entity_ruler: the second can add annotations """
|
||||
nlp = English()
|
||||
|
||||
# 1 : Entity Ruler - should set "this" to B and everything else to empty
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "THING", "pattern": "This"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
# 2: untrained NER - should set everything else to O
|
||||
untrained_ner = nlp.create_pipe("ner")
|
||||
untrained_ner.add_label("MY_LABEL")
|
||||
nlp.add_pipe(untrained_ner)
|
||||
nlp.begin_training()
|
||||
|
||||
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||
expected_types = ["THING", "", "", "", "", "", ""]
|
||||
assert [token.ent_iob_ for token in doc] == expected_iobs
|
||||
assert [token.ent_type_ for token in doc] == expected_types
|
||||
|
||||
|
||||
def test_ner_before_ruler():
|
||||
""" Test that an entity_ruler works after an NER: the second can overwrite O annotations """
|
||||
nlp = English()
|
||||
|
||||
# 1: untrained NER - should set everything to O
|
||||
untrained_ner = nlp.create_pipe("ner")
|
||||
untrained_ner.add_label("MY_LABEL")
|
||||
nlp.add_pipe(untrained_ner, name="uner")
|
||||
nlp.begin_training()
|
||||
|
||||
# 2 : Entity Ruler - should set "this" to B and keep everything else O
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "THING", "pattern": "This"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||
expected_types = ["THING", "", "", "", "", "", ""]
|
||||
assert [token.ent_iob_ for token in doc] == expected_iobs
|
||||
assert [token.ent_type_ for token in doc] == expected_types
|
||||
|
||||
|
||||
def test_block_ner():
|
||||
""" Test functionality for blocking tokens so they can't be in a named entity """
|
||||
# block "Antti L Korhonen" from being a named entity
|
||||
nlp = English()
|
||||
nlp.add_pipe(BlockerComponent1(2, 5))
|
||||
untrained_ner = nlp.create_pipe("ner")
|
||||
untrained_ner.add_label("MY_LABEL")
|
||||
nlp.add_pipe(untrained_ner, name="uner")
|
||||
nlp.begin_training()
|
||||
doc = nlp("This is Antti L Korhonen speaking in Finland")
|
||||
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
|
||||
expected_types = ["", "", "", "", "", "", "", ""]
|
||||
assert [token.ent_iob_ for token in doc] == expected_iobs
|
||||
assert [token.ent_type_ for token in doc] == expected_types
|
||||
|
||||
|
||||
class BlockerComponent1(object):
|
||||
name = "my_blocker"
|
||||
|
||||
def __init__(self, start, end):
|
||||
self.start = start
|
||||
self.end = end
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.ents = [(0, self.start, self.end)]
|
||||
return doc
|
||||
|
|
|
@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
|
|||
def test_issue999(train_data):
|
||||
"""Test that adding entities and resuming training works passably OK.
|
||||
There are two issues here:
|
||||
1) We have to readd labels. This isn't very nice.
|
||||
1) We have to read labels. This isn't very nice.
|
||||
2) There's no way to set the learning rate for the weight update, so we
|
||||
end up out-of-scale, causing it to learn too fast.
|
||||
"""
|
||||
|
|
42
spacy/tests/regression/test_issue4267.py
Normal file
42
spacy/tests/regression/test_issue4267.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
import spacy
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy.tokens import Span
|
||||
|
||||
|
||||
def test_issue4267():
|
||||
""" Test that running an entity_ruler after ner gives consistent results"""
|
||||
nlp = English()
|
||||
ner = nlp.create_pipe("ner")
|
||||
ner.add_label("PEOPLE")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
|
||||
assert "ner" in nlp.pipe_names
|
||||
|
||||
# assert that we have correct IOB annotations
|
||||
doc1 = nlp("hi")
|
||||
assert doc1.is_nered
|
||||
for token in doc1:
|
||||
assert token.ent_iob == 2
|
||||
|
||||
# add entity ruler and run again
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
||||
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
assert "entity_ruler" in nlp.pipe_names
|
||||
assert "ner" in nlp.pipe_names
|
||||
|
||||
# assert that we still have correct IOB annotations
|
||||
doc2 = nlp("hi")
|
||||
assert doc2.is_nered
|
||||
for token in doc2:
|
||||
assert token.ent_iob == 2
|
|
@ -256,7 +256,7 @@ cdef class Doc:
|
|||
def is_nered(self):
|
||||
"""Check if the document has named entities set. Will return True if
|
||||
*any* of the tokens has a named entity tag set (even if the others are
|
||||
uknown values).
|
||||
unknown values).
|
||||
"""
|
||||
if len(self) == 0:
|
||||
return True
|
||||
|
@ -525,13 +525,11 @@ cdef class Doc:
|
|||
|
||||
def __set__(self, ents):
|
||||
# TODO:
|
||||
# 1. Allow negative matches
|
||||
# 2. Ensure pre-set NERs are not over-written during statistical
|
||||
# prediction
|
||||
# 3. Test basic data-driven ORTH gazetteer
|
||||
# 4. Test more nuanced date and currency regex
|
||||
# 1. Test basic data-driven ORTH gazetteer
|
||||
# 2. Test more nuanced date and currency regex
|
||||
tokens_in_ents = {}
|
||||
cdef attr_t entity_type
|
||||
cdef attr_t kb_id
|
||||
cdef int ent_start, ent_end
|
||||
for ent_info in ents:
|
||||
entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
|
||||
|
@ -545,27 +543,31 @@ cdef class Doc:
|
|||
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
self.c[i].ent_type = 0
|
||||
self.c[i].ent_kb_id = 0
|
||||
self.c[i].ent_iob = 0 # Means missing.
|
||||
cdef attr_t ent_type
|
||||
cdef int start, end
|
||||
for ent_info in ents:
|
||||
ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
|
||||
if ent_type is None or ent_type < 0:
|
||||
# Mark as O
|
||||
for i in range(start, end):
|
||||
self.c[i].ent_type = 0
|
||||
self.c[i].ent_kb_id = 0
|
||||
self.c[i].ent_iob = 2
|
||||
# default values
|
||||
entity_type = 0
|
||||
kb_id = 0
|
||||
|
||||
# Set ent_iob to Missing (0) bij default unless this token was nered before
|
||||
ent_iob = 0
|
||||
if self.c[i].ent_iob != 0:
|
||||
ent_iob = 2
|
||||
|
||||
# overwrite if the token was part of a specified entity
|
||||
if i in tokens_in_ents.keys():
|
||||
ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
|
||||
if entity_type is None or entity_type <= 0:
|
||||
# Blocking this token from being overwritten by downstream NER
|
||||
ent_iob = 3
|
||||
elif ent_start == i:
|
||||
# Marking the start of an entity
|
||||
ent_iob = 3
|
||||
else:
|
||||
# Mark (inside) as I
|
||||
for i in range(start, end):
|
||||
self.c[i].ent_type = ent_type
|
||||
self.c[i].ent_kb_id = ent_kb_id
|
||||
self.c[i].ent_iob = 1
|
||||
# Set start as B
|
||||
self.c[start].ent_iob = 3
|
||||
# Marking the inside of an entity
|
||||
ent_iob = 1
|
||||
|
||||
self.c[i].ent_type = entity_type
|
||||
self.c[i].ent_kb_id = kb_id
|
||||
self.c[i].ent_iob = ent_iob
|
||||
|
||||
@property
|
||||
def noun_chunks(self):
|
||||
|
|
|
@ -749,7 +749,8 @@ cdef class Token:
|
|||
def ent_iob_(self):
|
||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||
"I" means it is inside an entity, "O" means it is outside an entity,
|
||||
and "" means no entity tag is set.
|
||||
and "" means no entity tag is set. "B" with an empty ent_type
|
||||
means that the token is blocked from further processing by NER.
|
||||
|
||||
RETURNS (unicode): IOB code of named entity tag.
|
||||
"""
|
||||
|
|
Loading…
Reference in New Issue
Block a user