spaCy/spacy/pipeline/_parser_internals/ner.pyx
Basile Dura b0228d8ea6
ci: add cython linter (#12694)
* chore: add cython-linter dev dependency

* fix: lexeme.pyx

* fix: morphology.pxd

* fix: tokenizer.pxd

* fix: vocab.pxd

* fix: morphology.pxd (line length)

* ci: add cython-lint

* ci: fix cython-lint call

* Fix kb/candidate.pyx.

* Fix kb/kb.pyx.

* Fix kb/kb_in_memory.pyx.

* Fix kb.

* Fix training/ partially.

* Fix training/. Ignore trailing whitespaces and too long lines.

* Fix ml/.

* Fix matcher/.

* Fix pipeline/.

* Fix tokens/.

* Fix build errors. Fix vocab.pyx.

* Fix cython-lint install and run.

* Fix lexeme.pyx, parts_of_speech.pxd, vectors.pyx. Temporarily disable cython-lint execution.

* Fix attrs.pyx, lexeme.pyx, symbols.pxd, isort issues.

* Make cython-lint install conditional. Fix tokenizer.pyx.

* Fix remaining files. Reenable cython-lint check.

* Readded parentheses.

* Fix test_build_dependencies().

* Add explanatory comment to cython-lint execution.

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-07-19 12:03:31 +02:00

689 lines
22 KiB
Cython

from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
from collections import Counter
from thinc.extra.search cimport Beam
from ...tokens.doc cimport Doc
from ...tokens.span import Span
from ...attrs cimport IS_SPACE
from ...lexeme cimport Lexeme
from ...structs cimport SpanC
from ...tokens.span cimport Span
from ...typedefs cimport attr_t, weight_t
from ...training import split_bilu_label
from ...training.example cimport Example
from ._state cimport StateC
from .stateclass cimport StateClass
from .transition_system cimport Transition, do_func_t
from ...errors import Errors
cdef enum:
MISSING
BEGIN
IN
LAST
UNIT
OUT
N_MOVES
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[MISSING] = 'M'
MOVE_NAMES[BEGIN] = 'B'
MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg
cdef class BiluoGold:
cdef Pool mem
cdef GoldNERStateC c
def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
self.mem = Pool()
self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)
def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c)
cdef GoldNERStateC create_gold_state(
Pool mem,
BiluoPushDown moves,
const StateC* stcls,
Example example,
neg_key
) except *:
cdef GoldNERStateC gs
cdef Span neg
if neg_key is not None:
negs = example.get_aligned_spans_y2x(
example.y.spans.get(neg_key, []),
allow_overlap=True
)
else:
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)
# Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
for pos_span in ner_ents:
if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))
# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
return gs
cdef void update_gold_state(GoldNERStateC* gs, const StateC* state) except *:
# We don't need to update each time, unlike the parser.
pass
cdef do_func_t[N_MOVES] do_funcs
cdef bint _entity_is_sunk(const StateC* state, Transition* golds) nogil:
if not state.entity_is_open():
return False
cdef const Transition* gold = &golds[state.E(0)]
ent = state.get_ent()
if gold.move != BEGIN and gold.move != UNIT:
return True
elif gold.label != ent.label:
return True
else:
return False
cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
@classmethod
def get_actions(cls, **kwargs):
actions = {
MISSING: Counter(),
BEGIN: Counter(),
IN: Counter(),
LAST: Counter(),
UNIT: Counter(),
OUT: Counter()
}
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
for entity_type in kwargs.get('entity_types', []):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1
for example in kwargs.get('examples', []):
for token in example.y:
ent_type = token.ent_type_
if ent_type:
for action in (BEGIN, IN, LAST, UNIT):
actions[action][ent_type] += 1
return actions
@property
def action_types(self):
return (BEGIN, IN, LAST, UNIT, OUT)
def get_doc_labels(self, doc):
labels = set()
for token in doc:
if token.ent_type:
labels.add(token.ent_type_)
return labels
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
elif move == MISSING:
return 'M'
else:
return MOVE_NAMES[move] + '-' + self.strings[label]
def init_gold_batch(self, examples):
all_states = self.init_batch([eg.predicted for eg in examples])
golds = []
states = []
for state, eg in zip(all_states, examples):
if self.has_gold(eg) and not state.is_final():
golds.append(self.init_gold(state, eg))
states.append(state)
n_steps = sum([len(s.queue) for s in states])
return states, golds, n_steps
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif '-' in name:
move_str, label_str = split_bilu_label(name)
# Deprecated, hacky way to denote 'not this entity'
if label_str.startswith('!'):
raise ValueError(Errors.E869.format(label=name))
label = self.strings.add(label_str)
else:
move_str = name
label = 0
move = MOVE_NAMES.index(move_str)
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
raise KeyError(Errors.E022.format(name=name))
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
t.score = 0
t.clas = clas
t.move = move
t.label = label
if move == MISSING:
t.is_valid = Missing.is_valid
t.do = Missing.transition
t.get_cost = Missing.cost
elif move == BEGIN:
t.is_valid = Begin.is_valid
t.do = Begin.transition
t.get_cost = Begin.cost
elif move == IN:
t.is_valid = In.is_valid
t.do = In.transition
t.get_cost = In.cost
elif move == LAST:
t.is_valid = Last.is_valid
t.do = Last.transition
t.get_cost = Last.cost
elif move == UNIT:
t.is_valid = Unit.is_valid
t.do = Unit.transition
t.get_cost = Unit.cost
elif move == OUT:
t.is_valid = Out.is_valid
t.do = Out.transition
t.get_cost = Out.cost
else:
raise ValueError(Errors.E019.format(action=move, src='ner'))
return t
def add_action(self, int action, label_name, freq=None):
cdef attr_t label_id
if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name)
else:
label_id = label_name
if action == OUT and label_id != 0:
return None
if action == MISSING:
return None
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size = self.n_moves
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
self.n_moves += 1
if self.labels.get(action, []):
freq = min(0, min(self.labels[action].values()))
self.labels[action][label_name] = freq-1
else:
self.labels[action] = Counter()
self.labels[action][label_name] = -1
return 1
def set_annotations(self, StateClass state, Doc doc):
cdef int i
ents = []
for i in range(state.c._ents.size()):
ent = state.c._ents.at(i)
if ent.start != -1 and ent.end != -1:
ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id))
doc.set_ents(ents, default="unmodified")
# Set non-blocked tokens to O
for i in range(doc.length):
if doc.c[i].ent_iob == 0:
doc.c[i].ent_iob = 2
def get_beam_parses(self, Beam beam):
parses = []
probs = beam.probs
for i in range(beam.size):
state = <StateC*>beam.at(i)
if state.is_final():
prob = probs[i]
parse = []
for j in range(state._ents.size()):
ent = state._ents.at(j)
if ent.start != -1 and ent.end != -1:
parse.append((ent.start, ent.end, self.strings[ent.label]))
parses.append((prob, parse))
return parses
def init_gold(self, StateClass state, Example example):
return BiluoGold(self, state, example, self.neg_key)
def has_gold(self, Example eg, start=0, end=None):
# We get x and y referring to X, we want to check relative to Y,
# the reference
y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
if not y_spans:
y_spans = [eg.y[:]]
y_span = y_spans[0]
start = y_span.start
end = y_span.end
neg_key = self.neg_key
if neg_key is not None:
# If we have any negative samples, count that as having annotation.
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True
else:
return False
def get_cost(self, StateClass stcls, gold, int i):
if not isinstance(gold, BiluoGold):
raise TypeError(Errors.E909.format(name="BiluoGold"))
cdef BiluoGold gold_ = gold
gold_state = gold_.c
if self.c[i].is_valid(stcls.c, self.c[i].label):
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
else:
cost = 9000
return cost
cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1:
if not isinstance(gold, BiluoGold):
raise TypeError(Errors.E909.format(name="BiluoGold"))
cdef BiluoGold gold_ = gold
gold_state = gold_.c
update_gold_state(&gold_state, state)
n_gold = 0
self.set_valid(is_valid, state)
for i in range(self.n_moves):
if is_valid[i]:
costs[i] = self.c[i].get_cost(state, &gold_state, self.c[i].label)
n_gold += costs[i] <= 0
else:
costs[i] = 9000
cdef class Missing:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False
@staticmethod
cdef int transition(StateC* s, attr_t label) nogil:
pass
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
return 9000
cdef class Begin:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if st.entity_is_open():
return False
if st.buffer_length() < 2:
# If we're the last token of the input, we can't B -- must U or O.
return False
elif label == 0:
return False
elif preset_ent_iob == 1:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
return False
elif preset_ent_iob == 3:
# Okay, we're in a preset entity.
if label != preset_ent_label:
# If label isn't right, reject
return False
elif st.B_(1).ent_iob != 1:
# If next token isn't marked I, we need to make U, not B.
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.B_(1).ent_iob == 3:
# If the next word is B, we can't B now
return False
elif st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
return False
# Don't allow entities to start on whitespace
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
b0 = s.B(0)
cdef int cost = 0
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
if g_act == MISSING:
pass
elif g_act == BEGIN:
# B, Gold B --> Label match
cost += label != g_tag
else:
# B, Gold I --> False (P)
# B, Gold L --> False (P)
# B, Gold O --> False (P)
# B, Gold U --> False (P)
cost += 1
if s.buffer_length() < 3:
# Handle negatives. In general we can't really do much to block
# B, because we don't know whether the whole entity is going to
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
cost += 1
break
return cost
cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if not st.entity_is_open():
return False
if st.buffer_length() < 2:
# If we're at the end, we can't I.
return False
ent = st.get_ent()
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif ent.label != label:
return False
elif preset_ent_iob == 3:
return False
elif st.B_(1).ent_iob == 3:
# If we know the next word is B, we can't be I (must be L)
return False
elif preset_ent_iob == 1:
if st.B_(1).ent_iob in (0, 2):
# if next preset is missing or O, this can't be I (must be L)
return False
elif label != preset_ent_label:
# If label isn't right, reject
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
cdef int g_act = gold.ner[s.B(0)].move
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING:
return 0
elif g_act == BEGIN:
# I, Gold B --> True
# (P of bad open entity sunk, R of this entity sunk)
return 0
elif g_act == IN:
# I, Gold I --> True
# (label forced by prev, if mismatch, P and R both sunk)
return 0
elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O
return not (is_sunk and (next_act == OUT or next_act == MISSING))
elif g_act == OUT:
# I, Gold O --> True iff next tag == O
return not (next_act == OUT or next_act == MISSING)
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
else:
return 1
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif not st.entity_is_open():
return False
elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
# If a preset entity has I followed by not-I, is L
if label != preset_ent_label:
# If label isn't right, reject
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.get_ent().label != label:
return False
elif st.B_(1).ent_iob == 1:
# If a preset entity has I next, we can't L here.
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent()
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
b0 = s.B(0)
ent_start = s.E(0)
cdef int g_act = gold.ner[b0].move
cdef int cost = 0
if g_act == MISSING:
pass
elif g_act == BEGIN:
# L, Gold B --> True
pass
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
cost += not _entity_is_sunk(s, gold.ner)
elif g_act == LAST:
# L, Gold L --> True
pass
elif g_act == OUT:
# L, Gold O --> True
pass
elif g_act == UNIT:
# L, Gold U --> True
pass
else:
cost += 1
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cost += 1
break
return cost
cdef class Unit:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
# this is only allowed if it's a preset blocked annotation
if preset_ent_label == 0 and preset_ent_iob == 3:
return True
else:
return False
elif st.entity_is_open():
return False
elif st.B_(1).ent_iob == 1:
# If next token is In, we can't be Unit -- must be Begin
return False
elif preset_ent_iob == 3:
# Okay, there's a preset entity here
if label != preset_ent_label:
# Require labels to match
return False
else:
# Otherwise return True, ignoring the whitespace constraint.
return True
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.close_ent()
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int cost = 0
if g_act == MISSING:
pass
elif g_act == UNIT:
# U, Gold U --> True iff tag match
cost += label != g_tag
else:
# U, Gold B --> False
# U, Gold I --> False
# U, Gold L --> False
# U, Gold O --> False
cost += 1
# If we have negative-example entities, integrate them into the objective.
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cost += 1
break
return cost
cdef class Out:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if st.entity_is_open():
return False
elif preset_ent_iob == 3:
return False
elif preset_ent_iob == 1:
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef weight_t cost = 0
if g_act == MISSING:
pass
elif g_act == BEGIN:
# O, Gold B --> False
cost += 1
elif g_act == IN:
# O, Gold I --> True
pass
elif g_act == LAST:
# O, Gold L --> True
pass
elif g_act == OUT:
# O, Gold O --> True
pass
elif g_act == UNIT:
# O, Gold U --> False
cost += 1
else:
cost += 1
return cost