mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Distinction between outside, missing and blocked NER annotations (#4307)
* remove duplicate unit test * unit test (currently failing) for issue 4267 * bugfix: ensure doc.ents preserves kb_id annotations * fix in setting doc.ents with empty label * rename * test for presetting an entity to a certain type * allow overwriting Outside + blocking presets * fix actions when previous label needs to be kept * fix default ent_iob in set entities * cleaner solution with U- action * remove debugging print statements * unit tests with explicit transitions and is_valid testing * remove U- from move_names explicitly * remove unit tests with pre-trained models that don't work * remove (working) unit tests with pre-trained models * clean up unit tests * move unit tests * small fixes * remove two TODO's from doc.ents comments
This commit is contained in:
parent
72463b062f
commit
de5a9ecdf3
|
@ -118,7 +118,7 @@ class Errors(object):
|
||||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||||
E013 = ("Error selecting action in matcher")
|
E013 = ("Error selecting action in matcher")
|
||||||
E014 = ("Uknown tag ID: {tag}")
|
E014 = ("Unknown tag ID: {tag}")
|
||||||
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
|
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
|
||||||
"`force=True` to overwrite.")
|
"`force=True` to overwrite.")
|
||||||
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
||||||
|
|
|
@ -66,7 +66,8 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
UNIT: Counter(),
|
UNIT: Counter(),
|
||||||
OUT: Counter()
|
OUT: Counter()
|
||||||
}
|
}
|
||||||
actions[OUT][''] = 1
|
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
|
||||||
|
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
|
||||||
for entity_type in kwargs.get('entity_types', []):
|
for entity_type in kwargs.get('entity_types', []):
|
||||||
for action in (BEGIN, IN, LAST, UNIT):
|
for action in (BEGIN, IN, LAST, UNIT):
|
||||||
actions[action][entity_type] = 1
|
actions[action][entity_type] = 1
|
||||||
|
@ -161,7 +162,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
else:
|
|
||||||
raise KeyError(Errors.E022.format(name=name))
|
raise KeyError(Errors.E022.format(name=name))
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||||
|
@ -266,7 +266,7 @@ cdef class Begin:
|
||||||
return False
|
return False
|
||||||
elif label == 0:
|
elif label == 0:
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 1 or preset_ent_iob == 2:
|
elif preset_ent_iob == 1:
|
||||||
# Ensure we don't clobber preset entities. If no entity preset,
|
# Ensure we don't clobber preset entities. If no entity preset,
|
||||||
# ent_iob is 0
|
# ent_iob is 0
|
||||||
return False
|
return False
|
||||||
|
@ -282,8 +282,8 @@ cdef class Begin:
|
||||||
# Otherwise, force acceptance, even if we're across a sentence
|
# Otherwise, force acceptance, even if we're across a sentence
|
||||||
# boundary or the token is whitespace.
|
# boundary or the token is whitespace.
|
||||||
return True
|
return True
|
||||||
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
|
elif st.B_(1).ent_iob == 3:
|
||||||
# If the next word is B or O, we can't B now
|
# If the next word is B, we can't B now
|
||||||
return False
|
return False
|
||||||
elif st.B_(1).sent_start == 1:
|
elif st.B_(1).sent_start == 1:
|
||||||
# Don't allow entities to extend across sentence boundaries
|
# Don't allow entities to extend across sentence boundaries
|
||||||
|
@ -326,6 +326,7 @@ cdef class In:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
|
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||||
if label == 0:
|
if label == 0:
|
||||||
return False
|
return False
|
||||||
elif st.E_(0).ent_type != label:
|
elif st.E_(0).ent_type != label:
|
||||||
|
@ -335,13 +336,22 @@ cdef class In:
|
||||||
elif st.B(1) == -1:
|
elif st.B(1) == -1:
|
||||||
# If we're at the end, we can't I.
|
# If we're at the end, we can't I.
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 2:
|
|
||||||
return False
|
|
||||||
elif preset_ent_iob == 3:
|
elif preset_ent_iob == 3:
|
||||||
return False
|
return False
|
||||||
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
|
elif st.B_(1).ent_iob == 3:
|
||||||
# If we know the next word is B or O, we can't be I (must be L)
|
# If we know the next word is B, we can't be I (must be L)
|
||||||
return False
|
return False
|
||||||
|
elif preset_ent_iob == 1:
|
||||||
|
if st.B_(1).ent_iob in (0, 2):
|
||||||
|
# if next preset is missing or O, this can't be I (must be L)
|
||||||
|
return False
|
||||||
|
elif label != preset_ent_label:
|
||||||
|
# If label isn't right, reject
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
# Otherwise, force acceptance, even if we're across a sentence
|
||||||
|
# boundary or the token is whitespace.
|
||||||
|
return True
|
||||||
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
||||||
# Don't allow entities to extend across sentence boundaries
|
# Don't allow entities to extend across sentence boundaries
|
||||||
return False
|
return False
|
||||||
|
@ -387,16 +397,23 @@ cdef class In:
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
cdef class Last:
|
cdef class Last:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
|
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||||
if label == 0:
|
if label == 0:
|
||||||
return False
|
return False
|
||||||
elif not st.entity_is_open():
|
elif not st.entity_is_open():
|
||||||
return False
|
return False
|
||||||
elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
|
elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
|
||||||
# If a preset entity has I followed by not-I, is L
|
# If a preset entity has I followed by not-I, is L
|
||||||
|
if label != preset_ent_label:
|
||||||
|
# If label isn't right, reject
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
# Otherwise, force acceptance, even if we're across a sentence
|
||||||
|
# boundary or the token is whitespace.
|
||||||
return True
|
return True
|
||||||
elif st.E_(0).ent_type != label:
|
elif st.E_(0).ent_type != label:
|
||||||
return False
|
return False
|
||||||
|
@ -450,12 +467,13 @@ cdef class Unit:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||||
if label == 0:
|
if label == 0:
|
||||||
|
# this is only allowed if it's a preset blocked annotation
|
||||||
|
if preset_ent_label == 0 and preset_ent_iob == 3:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
return False
|
return False
|
||||||
elif st.entity_is_open():
|
elif st.entity_is_open():
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 2:
|
|
||||||
# Don't clobber preset O
|
|
||||||
return False
|
|
||||||
elif st.B_(1).ent_iob == 1:
|
elif st.B_(1).ent_iob == 1:
|
||||||
# If next token is In, we can't be Unit -- must be Begin
|
# If next token is In, we can't be Unit -- must be Begin
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -135,6 +135,8 @@ cdef class Parser:
|
||||||
names = []
|
names = []
|
||||||
for i in range(self.moves.n_moves):
|
for i in range(self.moves.n_moves):
|
||||||
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
||||||
|
# Explicitly removing the internal "U-" token used for blocking entities
|
||||||
|
if name != "U-":
|
||||||
names.append(name)
|
names.append(name)
|
||||||
return names
|
return names
|
||||||
|
|
||||||
|
|
|
@ -16,10 +16,23 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
ner(doc)
|
ner(doc)
|
||||||
assert len(list(doc.ents)) == 0
|
assert len(list(doc.ents)) == 0
|
||||||
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
||||||
|
|
||||||
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
||||||
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
|
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
|
||||||
|
|
||||||
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
||||||
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
|
assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_ents_reset(en_vocab):
|
||||||
|
text = ["This", "is", "a", "lion"]
|
||||||
|
doc = get_doc(en_vocab, text)
|
||||||
|
ner = EntityRecognizer(en_vocab)
|
||||||
|
ner.begin_training([])
|
||||||
|
ner(doc)
|
||||||
|
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||||
|
doc.ents = list(doc.ents)
|
||||||
|
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||||
|
|
||||||
|
|
||||||
def test_add_overlapping_entities(en_vocab):
|
def test_add_overlapping_entities(en_vocab):
|
||||||
|
|
|
@ -2,7 +2,9 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.pipeline import EntityRecognizer
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
from spacy.pipeline import EntityRecognizer, EntityRuler
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.syntax.ner import BiluoPushDown
|
from spacy.syntax.ner import BiluoPushDown
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
@ -80,14 +82,145 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
|
||||||
assert names
|
assert names
|
||||||
|
|
||||||
|
|
||||||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
def test_accept_blocked_token():
|
||||||
doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
|
"""Test succesful blocking of tokens to be in an entity."""
|
||||||
ner = EntityRecognizer(en_vocab)
|
# 1. test normal behaviour
|
||||||
ner.begin_training([])
|
nlp1 = English()
|
||||||
ner(doc)
|
doc1 = nlp1("I live in New York")
|
||||||
assert len(list(doc.ents)) == 0
|
ner1 = EntityRecognizer(doc1.vocab)
|
||||||
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
|
||||||
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
|
||||||
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
|
|
||||||
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
# Add the OUT action
|
||||||
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
|
ner1.moves.add_action(5, "")
|
||||||
|
ner1.add_label("GPE")
|
||||||
|
# Get into the state just before "New"
|
||||||
|
state1 = ner1.moves.init_batch([doc1])[0]
|
||||||
|
ner1.moves.apply_transition(state1, "O")
|
||||||
|
ner1.moves.apply_transition(state1, "O")
|
||||||
|
ner1.moves.apply_transition(state1, "O")
|
||||||
|
# Check that B-GPE is valid.
|
||||||
|
assert ner1.moves.is_valid(state1, "B-GPE")
|
||||||
|
|
||||||
|
# 2. test blocking behaviour
|
||||||
|
nlp2 = English()
|
||||||
|
doc2 = nlp2("I live in New York")
|
||||||
|
ner2 = EntityRecognizer(doc2.vocab)
|
||||||
|
|
||||||
|
# set "New York" to a blocked entity
|
||||||
|
doc2.ents = [(0, 3, 5)]
|
||||||
|
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
|
||||||
|
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
|
||||||
|
|
||||||
|
# Check that B-GPE is now invalid.
|
||||||
|
ner2.moves.add_action(4, "")
|
||||||
|
ner2.moves.add_action(5, "")
|
||||||
|
ner2.add_label("GPE")
|
||||||
|
state2 = ner2.moves.init_batch([doc2])[0]
|
||||||
|
ner2.moves.apply_transition(state2, "O")
|
||||||
|
ner2.moves.apply_transition(state2, "O")
|
||||||
|
ner2.moves.apply_transition(state2, "O")
|
||||||
|
# we can only use U- for "New"
|
||||||
|
assert not ner2.moves.is_valid(state2, "B-GPE")
|
||||||
|
assert ner2.moves.is_valid(state2, "U-")
|
||||||
|
ner2.moves.apply_transition(state2, "U-")
|
||||||
|
# we can only use U- for "York"
|
||||||
|
assert not ner2.moves.is_valid(state2, "B-GPE")
|
||||||
|
assert ner2.moves.is_valid(state2, "U-")
|
||||||
|
|
||||||
|
|
||||||
|
def test_overwrite_token():
|
||||||
|
nlp = English()
|
||||||
|
ner1 = nlp.create_pipe("ner")
|
||||||
|
nlp.add_pipe(ner1, name="ner")
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
# The untrained NER will predict O for each token
|
||||||
|
doc = nlp("I live in New York")
|
||||||
|
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
|
||||||
|
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
|
||||||
|
|
||||||
|
# Check that a new ner can overwrite O
|
||||||
|
ner2 = EntityRecognizer(doc.vocab)
|
||||||
|
ner2.moves.add_action(5, "")
|
||||||
|
ner2.add_label("GPE")
|
||||||
|
state = ner2.moves.init_batch([doc])[0]
|
||||||
|
assert ner2.moves.is_valid(state, "B-GPE")
|
||||||
|
assert ner2.moves.is_valid(state, "U-GPE")
|
||||||
|
ner2.moves.apply_transition(state, "B-GPE")
|
||||||
|
assert ner2.moves.is_valid(state, "I-GPE")
|
||||||
|
assert ner2.moves.is_valid(state, "L-GPE")
|
||||||
|
|
||||||
|
|
||||||
|
def test_ruler_before_ner():
|
||||||
|
""" Test that an NER works after an entity_ruler: the second can add annotations """
|
||||||
|
nlp = English()
|
||||||
|
|
||||||
|
# 1 : Entity Ruler - should set "this" to B and everything else to empty
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "THING", "pattern": "This"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
|
||||||
|
# 2: untrained NER - should set everything else to O
|
||||||
|
untrained_ner = nlp.create_pipe("ner")
|
||||||
|
untrained_ner.add_label("MY_LABEL")
|
||||||
|
nlp.add_pipe(untrained_ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||||
|
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||||
|
expected_types = ["THING", "", "", "", "", "", ""]
|
||||||
|
assert [token.ent_iob_ for token in doc] == expected_iobs
|
||||||
|
assert [token.ent_type_ for token in doc] == expected_types
|
||||||
|
|
||||||
|
|
||||||
|
def test_ner_before_ruler():
|
||||||
|
""" Test that an entity_ruler works after an NER: the second can overwrite O annotations """
|
||||||
|
nlp = English()
|
||||||
|
|
||||||
|
# 1: untrained NER - should set everything to O
|
||||||
|
untrained_ner = nlp.create_pipe("ner")
|
||||||
|
untrained_ner.add_label("MY_LABEL")
|
||||||
|
nlp.add_pipe(untrained_ner, name="uner")
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
# 2 : Entity Ruler - should set "this" to B and keep everything else O
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "THING", "pattern": "This"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
|
||||||
|
doc = nlp("This is Antti Korhonen speaking in Finland")
|
||||||
|
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
|
||||||
|
expected_types = ["THING", "", "", "", "", "", ""]
|
||||||
|
assert [token.ent_iob_ for token in doc] == expected_iobs
|
||||||
|
assert [token.ent_type_ for token in doc] == expected_types
|
||||||
|
|
||||||
|
|
||||||
|
def test_block_ner():
|
||||||
|
""" Test functionality for blocking tokens so they can't be in a named entity """
|
||||||
|
# block "Antti L Korhonen" from being a named entity
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(BlockerComponent1(2, 5))
|
||||||
|
untrained_ner = nlp.create_pipe("ner")
|
||||||
|
untrained_ner.add_label("MY_LABEL")
|
||||||
|
nlp.add_pipe(untrained_ner, name="uner")
|
||||||
|
nlp.begin_training()
|
||||||
|
doc = nlp("This is Antti L Korhonen speaking in Finland")
|
||||||
|
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
|
||||||
|
expected_types = ["", "", "", "", "", "", "", ""]
|
||||||
|
assert [token.ent_iob_ for token in doc] == expected_iobs
|
||||||
|
assert [token.ent_type_ for token in doc] == expected_types
|
||||||
|
|
||||||
|
|
||||||
|
class BlockerComponent1(object):
|
||||||
|
name = "my_blocker"
|
||||||
|
|
||||||
|
def __init__(self, start, end):
|
||||||
|
self.start = start
|
||||||
|
self.end = end
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
doc.ents = [(0, self.start, self.end)]
|
||||||
|
return doc
|
||||||
|
|
|
@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
|
||||||
def test_issue999(train_data):
|
def test_issue999(train_data):
|
||||||
"""Test that adding entities and resuming training works passably OK.
|
"""Test that adding entities and resuming training works passably OK.
|
||||||
There are two issues here:
|
There are two issues here:
|
||||||
1) We have to readd labels. This isn't very nice.
|
1) We have to read labels. This isn't very nice.
|
||||||
2) There's no way to set the learning rate for the weight update, so we
|
2) There's no way to set the learning rate for the weight update, so we
|
||||||
end up out-of-scale, causing it to learn too fast.
|
end up out-of-scale, causing it to learn too fast.
|
||||||
"""
|
"""
|
||||||
|
|
42
spacy/tests/regression/test_issue4267.py
Normal file
42
spacy/tests/regression/test_issue4267.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
|
from spacy.tokens import Span
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4267():
|
||||||
|
""" Test that running an entity_ruler after ner gives consistent results"""
|
||||||
|
nlp = English()
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
ner.add_label("PEOPLE")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
nlp.begin_training()
|
||||||
|
|
||||||
|
assert "ner" in nlp.pipe_names
|
||||||
|
|
||||||
|
# assert that we have correct IOB annotations
|
||||||
|
doc1 = nlp("hi")
|
||||||
|
assert doc1.is_nered
|
||||||
|
for token in doc1:
|
||||||
|
assert token.ent_iob == 2
|
||||||
|
|
||||||
|
# add entity ruler and run again
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
||||||
|
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
assert "entity_ruler" in nlp.pipe_names
|
||||||
|
assert "ner" in nlp.pipe_names
|
||||||
|
|
||||||
|
# assert that we still have correct IOB annotations
|
||||||
|
doc2 = nlp("hi")
|
||||||
|
assert doc2.is_nered
|
||||||
|
for token in doc2:
|
||||||
|
assert token.ent_iob == 2
|
|
@ -256,7 +256,7 @@ cdef class Doc:
|
||||||
def is_nered(self):
|
def is_nered(self):
|
||||||
"""Check if the document has named entities set. Will return True if
|
"""Check if the document has named entities set. Will return True if
|
||||||
*any* of the tokens has a named entity tag set (even if the others are
|
*any* of the tokens has a named entity tag set (even if the others are
|
||||||
uknown values).
|
unknown values).
|
||||||
"""
|
"""
|
||||||
if len(self) == 0:
|
if len(self) == 0:
|
||||||
return True
|
return True
|
||||||
|
@ -525,13 +525,11 @@ cdef class Doc:
|
||||||
|
|
||||||
def __set__(self, ents):
|
def __set__(self, ents):
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1. Allow negative matches
|
# 1. Test basic data-driven ORTH gazetteer
|
||||||
# 2. Ensure pre-set NERs are not over-written during statistical
|
# 2. Test more nuanced date and currency regex
|
||||||
# prediction
|
|
||||||
# 3. Test basic data-driven ORTH gazetteer
|
|
||||||
# 4. Test more nuanced date and currency regex
|
|
||||||
tokens_in_ents = {}
|
tokens_in_ents = {}
|
||||||
cdef attr_t entity_type
|
cdef attr_t entity_type
|
||||||
|
cdef attr_t kb_id
|
||||||
cdef int ent_start, ent_end
|
cdef int ent_start, ent_end
|
||||||
for ent_info in ents:
|
for ent_info in ents:
|
||||||
entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
|
entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
|
||||||
|
@ -545,27 +543,31 @@ cdef class Doc:
|
||||||
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
|
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
self.c[i].ent_type = 0
|
# default values
|
||||||
self.c[i].ent_kb_id = 0
|
entity_type = 0
|
||||||
self.c[i].ent_iob = 0 # Means missing.
|
kb_id = 0
|
||||||
cdef attr_t ent_type
|
|
||||||
cdef int start, end
|
# Set ent_iob to Missing (0) bij default unless this token was nered before
|
||||||
for ent_info in ents:
|
ent_iob = 0
|
||||||
ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
|
if self.c[i].ent_iob != 0:
|
||||||
if ent_type is None or ent_type < 0:
|
ent_iob = 2
|
||||||
# Mark as O
|
|
||||||
for i in range(start, end):
|
# overwrite if the token was part of a specified entity
|
||||||
self.c[i].ent_type = 0
|
if i in tokens_in_ents.keys():
|
||||||
self.c[i].ent_kb_id = 0
|
ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
|
||||||
self.c[i].ent_iob = 2
|
if entity_type is None or entity_type <= 0:
|
||||||
|
# Blocking this token from being overwritten by downstream NER
|
||||||
|
ent_iob = 3
|
||||||
|
elif ent_start == i:
|
||||||
|
# Marking the start of an entity
|
||||||
|
ent_iob = 3
|
||||||
else:
|
else:
|
||||||
# Mark (inside) as I
|
# Marking the inside of an entity
|
||||||
for i in range(start, end):
|
ent_iob = 1
|
||||||
self.c[i].ent_type = ent_type
|
|
||||||
self.c[i].ent_kb_id = ent_kb_id
|
self.c[i].ent_type = entity_type
|
||||||
self.c[i].ent_iob = 1
|
self.c[i].ent_kb_id = kb_id
|
||||||
# Set start as B
|
self.c[i].ent_iob = ent_iob
|
||||||
self.c[start].ent_iob = 3
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def noun_chunks(self):
|
def noun_chunks(self):
|
||||||
|
|
|
@ -749,7 +749,8 @@ cdef class Token:
|
||||||
def ent_iob_(self):
|
def ent_iob_(self):
|
||||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||||
"I" means it is inside an entity, "O" means it is outside an entity,
|
"I" means it is inside an entity, "O" means it is outside an entity,
|
||||||
and "" means no entity tag is set.
|
and "" means no entity tag is set. "B" with an empty ent_type
|
||||||
|
means that the token is blocked from further processing by NER.
|
||||||
|
|
||||||
RETURNS (unicode): IOB code of named entity tag.
|
RETURNS (unicode): IOB code of named entity tag.
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user