mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-13 18:10:35 +03:00
Fix NER when preset entities cross sentence boundaries (#3379)
💫 Fix NER when preset entities cross sentence boundaries
This commit is contained in:
parent
3fe5811fa7
commit
a5b1f6dcec
|
@@ -157,6 +157,10 @@ cdef void cpu_log_loss(float* d_scores,
|
||||||
cdef double max_, gmax, Z, gZ
|
cdef double max_, gmax, Z, gZ
|
||||||
best = arg_max_if_gold(scores, costs, is_valid, O)
|
best = arg_max_if_gold(scores, costs, is_valid, O)
|
||||||
guess = arg_max_if_valid(scores, is_valid, O)
|
guess = arg_max_if_valid(scores, is_valid, O)
|
||||||
|
if best == -1 or guess == -1:
|
||||||
|
# These shouldn't happen, but if they do, we want to make sure we don't
|
||||||
|
# cause an OOB access.
|
||||||
|
return
|
||||||
Z = 1e-10
|
Z = 1e-10
|
||||||
gZ = 1e-10
|
gZ = 1e-10
|
||||||
max_ = scores[guess]
|
max_ = scores[guess]
|
||||||
|
|
|
@@ -323,6 +323,12 @@ cdef cppclass StateC:
|
||||||
if this._s_i >= 1:
|
if this._s_i >= 1:
|
||||||
this._s_i -= 1
|
this._s_i -= 1
|
||||||
|
|
||||||
|
void force_final() nogil:
|
||||||
|
# This should only be used in desperate situations, as it may leave
|
||||||
|
# the analysis in an unexpected state.
|
||||||
|
this._s_i = 0
|
||||||
|
this._b_i = this.length
|
||||||
|
|
||||||
void unshift() nogil:
|
void unshift() nogil:
|
||||||
this._b_i -= 1
|
this._b_i -= 1
|
||||||
this._buffer[this._b_i] = this.S(0)
|
this._buffer[this._b_i] = this.S(0)
|
||||||
|
|
|
@@ -257,30 +257,42 @@ cdef class Missing:
|
||||||
cdef class Begin:
|
cdef class Begin:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
|
cdef int preset_ent_label = st.B_(0).ent_type
|
||||||
|
# If we're the last token of the input, we can't B -- must U or O.
|
||||||
|
if st.B(1) == -1:
|
||||||
|
return False
|
||||||
|
elif st.entity_is_open():
|
||||||
|
return False
|
||||||
|
elif label == 0:
|
||||||
|
return False
|
||||||
|
elif preset_ent_iob == 1 or preset_ent_iob == 2:
|
||||||
# Ensure we don't clobber preset entities. If no entity preset,
|
# Ensure we don't clobber preset entities. If no entity preset,
|
||||||
# ent_iob is 0
|
# ent_iob is 0
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
|
||||||
if preset_ent_iob == 1:
|
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 2:
|
elif preset_ent_iob == 3:
|
||||||
|
# Okay, we're in a preset entity.
|
||||||
|
if label != preset_ent_label:
|
||||||
|
# If label isn't right, reject
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 3 and st.B_(0).ent_type != label:
|
elif st.B_(1).ent_iob != 1:
|
||||||
|
# If next token isn't marked I, we need to make U, not B.
|
||||||
return False
|
return False
|
||||||
# If the next word is B or O, we can't B now
|
else:
|
||||||
|
# Otherwise, force acceptance, even if we're across a sentence
|
||||||
|
# boundary or the token is whitespace.
|
||||||
|
return True
|
||||||
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
|
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
|
||||||
|
# If the next word is B or O, we can't B now
|
||||||
return False
|
return False
|
||||||
# If the current word is B, and the next word isn't I, the current word
|
|
||||||
# is really U
|
|
||||||
elif preset_ent_iob == 3 and st.B_(1).ent_iob != 1:
|
|
||||||
return False
|
|
||||||
# Don't allow entities to extend across sentence boundaries
|
|
||||||
elif st.B_(1).sent_start == 1:
|
elif st.B_(1).sent_start == 1:
|
||||||
|
# Don't allow entities to extend across sentence boundaries
|
||||||
return False
|
return False
|
||||||
# Don't allow entities to start on whitespace
|
# Don't allow entities to start on whitespace
|
||||||
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
|
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return label != 0 and not st.entity_is_open()
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@@ -314,18 +326,27 @@ cdef class In:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 2:
|
if label == 0:
|
||||||
|
return False
|
||||||
|
elif st.E_(0).ent_type != label:
|
||||||
|
return False
|
||||||
|
elif not st.entity_is_open():
|
||||||
|
return False
|
||||||
|
elif st.B(1) == -1:
|
||||||
|
# If we're at the end, we can't I.
|
||||||
|
return False
|
||||||
|
elif preset_ent_iob == 2:
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 3:
|
elif preset_ent_iob == 3:
|
||||||
return False
|
return False
|
||||||
# TODO: Is this quite right? I think it's supposed to be ensuring the
|
elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
|
||||||
# gazetteer matches are maintained
|
# If we know the next word is B or O, we can't be I (must be L)
|
||||||
elif st.B(1) != -1 and st.B_(1).ent_iob != preset_ent_iob:
|
|
||||||
return False
|
return False
|
||||||
# Don't allow entities to extend across sentence boundaries
|
|
||||||
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
||||||
|
# Don't allow entities to extend across sentence boundaries
|
||||||
return False
|
return False
|
||||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@@ -370,9 +391,17 @@ cdef class In:
|
||||||
cdef class Last:
|
cdef class Last:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
if st.B_(1).ent_iob == 1:
|
if label == 0:
|
||||||
return False
|
return False
|
||||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
elif not st.entity_is_open():
|
||||||
|
return False
|
||||||
|
elif st.E_(0).ent_type != label:
|
||||||
|
return False
|
||||||
|
elif st.B_(1).ent_iob == 1:
|
||||||
|
# If a preset entity has I next, we can't L here.
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@@ -416,17 +445,29 @@ cdef class Unit:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 2:
|
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||||
|
if label == 0:
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 1:
|
elif st.entity_is_open():
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 3 and st.B_(0).ent_type != label:
|
elif preset_ent_iob == 2:
|
||||||
|
# Don't clobber preset O
|
||||||
return False
|
return False
|
||||||
elif st.B_(1).ent_iob == 1:
|
elif st.B_(1).ent_iob == 1:
|
||||||
|
# If next token is In, we can't be Unit -- must be Begin
|
||||||
return False
|
return False
|
||||||
|
elif preset_ent_iob == 3:
|
||||||
|
# Okay, there's a preset entity here
|
||||||
|
if label != preset_ent_label:
|
||||||
|
# Require labels to match
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
# Otherwise return True, ignoring the whitespace constraint.
|
||||||
|
return True
|
||||||
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
|
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
|
||||||
return False
|
return False
|
||||||
return label != 0 and not st.entity_is_open()
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@@ -461,11 +502,14 @@ cdef class Out:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 3:
|
if st.entity_is_open():
|
||||||
|
return False
|
||||||
|
elif preset_ent_iob == 3:
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 1:
|
elif preset_ent_iob == 1:
|
||||||
return False
|
return False
|
||||||
return not st.entity_is_open()
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
|
|
@@ -363,6 +363,11 @@ cdef class Parser:
|
||||||
for i in range(batch_size):
|
for i in range(batch_size):
|
||||||
self.moves.set_valid(is_valid, states[i])
|
self.moves.set_valid(is_valid, states[i])
|
||||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||||
|
if guess == -1:
|
||||||
|
# This shouldn't happen, but it's hard to raise an error here,
|
||||||
|
# and we don't want to infinite loop. So, force to end state.
|
||||||
|
states[i].force_final()
|
||||||
|
else:
|
||||||
action = self.moves.c[guess]
|
action = self.moves.c[guess]
|
||||||
action.do(states[i], action.label)
|
action.do(states[i], action.label)
|
||||||
states[i].push_hist(guess)
|
states[i].push_hist(guess)
|
||||||
|
|
|
@@ -1,5 +1,8 @@
|
||||||
"""Test interaction between preset entities and sentence boundaries in NER."""
|
# coding: utf8
|
||||||
import spacy
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.pipeline import EntityRuler, EntityRecognizer
|
from spacy.pipeline import EntityRuler, EntityRecognizer
|
||||||
|
|
||||||
|
@@ -7,7 +10,7 @@ from spacy.pipeline import EntityRuler, EntityRecognizer
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_issue3345():
|
def test_issue3345():
|
||||||
"""Test case where preset entity crosses sentence boundary."""
|
"""Test case where preset entity crosses sentence boundary."""
|
||||||
nlp = spacy.blank("en")
|
nlp = English()
|
||||||
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
|
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
|
||||||
doc[4].is_sent_start = True
|
doc[4].is_sent_start = True
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user