mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-13 07:55:49 +03:00
NER: Ensure zero-cost sequence with sentence split in entity
If we use a sentence splitter as one of the annotating components during training, an entity can become split in the predicted `Doc`. Before this change, training would fail, because no zero-cost transition sequence could be found. This fixes two scenarios: 1. When the gold action is `B` and a split occurs after the current token, the `BEGIN` action is invalid. However, this was the only possible zero-cost action. This change makes `OUT` a zero-cost action in this case. 2. When the gold action is `I` and a split occurs after the current token, the `IN` action is invalid, removing the only zero-cost action. This change makes `LAST` a zero-cost action, so that the entity can be properly closed.
This commit is contained in:
parent
28de85737f
commit
6906af3d8f
|
@ -120,6 +120,10 @@ cdef bint _entity_is_sunk(const StateC* state, Transition* golds) nogil:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
cdef bint _next_is_sent_start(const StateC* state) nogil:
|
||||||
|
return state.B(1) != -1 and state.B_(1).sent_start == 1
|
||||||
|
|
||||||
|
|
||||||
cdef class BiluoPushDown(TransitionSystem):
|
cdef class BiluoPushDown(TransitionSystem):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
TransitionSystem.__init__(self, *args, **kwargs)
|
TransitionSystem.__init__(self, *args, **kwargs)
|
||||||
|
@ -388,7 +392,7 @@ cdef class Begin:
|
||||||
elif st.B_(1).ent_iob == 3:
|
elif st.B_(1).ent_iob == 3:
|
||||||
# If the next word is B, we can't B now
|
# If the next word is B, we can't B now
|
||||||
return False
|
return False
|
||||||
elif st.B_(1).sent_start == 1:
|
elif _next_is_sent_start(st):
|
||||||
# Don't allow entities to extend across sentence boundaries
|
# Don't allow entities to extend across sentence boundaries
|
||||||
return False
|
return False
|
||||||
# Don't allow entities to start on whitespace
|
# Don't allow entities to start on whitespace
|
||||||
|
@ -466,7 +470,7 @@ cdef class In:
|
||||||
# Otherwise, force acceptance, even if we're across a sentence
|
# Otherwise, force acceptance, even if we're across a sentence
|
||||||
# boundary or the token is whitespace.
|
# boundary or the token is whitespace.
|
||||||
return True
|
return True
|
||||||
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
elif _next_is_sent_start(st):
|
||||||
# Don't allow entities to extend across sentence boundaries
|
# Don't allow entities to extend across sentence boundaries
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
|
@ -558,8 +562,9 @@ cdef class Last:
|
||||||
# L, Gold B --> True
|
# L, Gold B --> True
|
||||||
pass
|
pass
|
||||||
elif g_act == IN:
|
elif g_act == IN:
|
||||||
# L, Gold I --> True iff this entity sunk
|
# L, Gold I --> True iff this entity sunk or there is a
|
||||||
cost += not _entity_is_sunk(s, gold.ner)
|
# sentence break after the first buffer token.
|
||||||
|
cost += not (_entity_is_sunk(s, gold.ner) or _next_is_sent_start(s))
|
||||||
elif g_act == LAST:
|
elif g_act == LAST:
|
||||||
# L, Gold L --> True
|
# L, Gold L --> True
|
||||||
pass
|
pass
|
||||||
|
@ -674,8 +679,9 @@ cdef class Out:
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
pass
|
pass
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
# O, Gold B --> False
|
# O, Gold B --> False, unless there is a sentence break after
|
||||||
cost += 1
|
# the first buffer token.
|
||||||
|
cost += not _next_is_sent_start(s)
|
||||||
elif g_act == IN:
|
elif g_act == IN:
|
||||||
# O, Gold I --> True
|
# O, Gold I --> True
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -816,6 +816,31 @@ def test_ner_warns_no_lookups(caplog):
|
||||||
assert "W033" not in caplog.text
|
assert "W033" not in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_train_sent_split_in_entity():
|
||||||
|
# Check that we can train on inputs when entities are sentence-split
|
||||||
|
# by an annotating component.
|
||||||
|
nlp = English()
|
||||||
|
ner = nlp.add_pipe("ner", config={"update_with_oracle_cut_size": 3})
|
||||||
|
|
||||||
|
eg = Example.from_dict(
|
||||||
|
nlp.make_doc("I like the Kinesis Advantage2 LF very much."),
|
||||||
|
{"entities": [(11, 32, "MISC")]},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Go berserk, put a boundary on every combination of tokens.
|
||||||
|
train_examples = []
|
||||||
|
for i in range(1, len(eg.predicted)):
|
||||||
|
for j in range(1, len(eg.predicted)):
|
||||||
|
eg_ij = eg.copy()
|
||||||
|
eg_ij.predicted[i].is_sent_start = True
|
||||||
|
eg_ij.predicted[j].is_sent_start = True
|
||||||
|
train_examples.append(eg_ij)
|
||||||
|
|
||||||
|
ner.add_label("MISC")
|
||||||
|
nlp.initialize()
|
||||||
|
nlp.update(train_examples, sgd=False, annotates=[])
|
||||||
|
|
||||||
|
|
||||||
@Language.factory("blocker")
|
@Language.factory("blocker")
|
||||||
class BlockerComponent1:
|
class BlockerComponent1:
|
||||||
def __init__(self, nlp, start, end, name="my_blocker"):
|
def __init__(self, nlp, start, end, name="my_blocker"):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user