2023-06-26 12:41:03 +03:00
|
|
|
from cymem.cymem cimport Pool
|
2022-01-12 15:38:52 +03:00
|
|
|
from libcpp.memory cimport shared_ptr
|
|
|
|
from libcpp.vector cimport vector
|
2017-04-15 14:05:15 +03:00
|
|
|
|
2020-10-04 12:16:31 +03:00
|
|
|
from collections import Counter
|
|
|
|
|
2020-12-13 04:08:32 +03:00
|
|
|
from ...tokens.doc cimport Doc
|
2023-06-26 12:41:03 +03:00
|
|
|
|
2020-12-13 04:08:32 +03:00
|
|
|
from ...tokens.span import Span
|
2023-06-26 12:41:03 +03:00
|
|
|
|
2020-07-31 00:30:54 +03:00
|
|
|
from ...attrs cimport IS_SPACE
|
2023-06-26 12:41:03 +03:00
|
|
|
from ...lexeme cimport Lexeme
|
2023-07-19 13:03:31 +03:00
|
|
|
from ...structs cimport SpanC
|
2023-06-26 12:41:03 +03:00
|
|
|
from ...tokens.span cimport Span
|
|
|
|
from ...typedefs cimport attr_t, weight_t
|
|
|
|
|
2022-06-17 22:02:37 +03:00
|
|
|
from ...training import split_bilu_label
|
2023-06-26 12:41:03 +03:00
|
|
|
|
2020-09-09 11:31:03 +03:00
|
|
|
from ...training.example cimport Example
|
2023-06-26 12:41:03 +03:00
|
|
|
from ._state cimport StateC
|
2022-12-17 16:32:19 +03:00
|
|
|
from .search cimport Beam
|
2017-04-15 14:05:15 +03:00
|
|
|
from .stateclass cimport StateClass
|
2020-07-31 00:30:54 +03:00
|
|
|
from .transition_system cimport Transition, do_func_t
|
2015-02-02 08:38:52 +03:00
|
|
|
|
2020-10-04 12:16:31 +03:00
|
|
|
from ...errors import Errors
|
|
|
|
|
2015-02-02 08:38:52 +03:00
|
|
|
|
|
|
|
# Integer identifiers of the BILUO moves. The values are also used as indices
# into MOVE_NAMES below; N_MOVES is the number of moves.
cdef enum:
    MISSING
    BEGIN
    IN
    LAST
    UNIT
    OUT
    N_MOVES


# Single-letter tag of each move, indexed by the move's enum value.
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[MISSING], MOVE_NAMES[BEGIN], MOVE_NAMES[IN] = 'M', 'B', 'I'
MOVE_NAMES[LAST], MOVE_NAMES[UNIT], MOVE_NAMES[OUT] = 'L', 'U', 'O'
|
|
|
|
|
2015-02-02 08:38:52 +03:00
|
|
|
|
2020-06-26 20:34:12 +03:00
|
|
|
# Gold-standard information consulted by the cost functions of the moves.
cdef struct GoldNERStateC:
    # One gold transition per token; filled by create_gold_state() from the
    # aligned NER tags of the example.
    Transition* ner
    # Negative-example spans (spans known to be wrong), appended by
    # create_gold_state() from the example's aligned negative spans.
    vector[shared_ptr[SpanC]] negs
|
2020-06-26 20:34:12 +03:00
|
|
|
|
|
|
|
|
|
|
|
cdef class BiluoGold:
    """Python-level owner of a GoldNERStateC and of the memory pool that backs
    its per-token transition array."""
    cdef Pool mem
    cdef GoldNERStateC c

    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
        # The pool is stored on the instance before the gold state is built,
        # so the allocation made by create_gold_state() lives as long as self.
        cdef Pool pool = Pool()
        self.mem = pool
        self.c = create_gold_state(pool, moves, stcls.c, example, neg_key)

    def update(self, StateClass stcls):
        """Refresh the gold state for the given parse state by delegating to
        update_gold_state()."""
        update_gold_state(&self.c, stcls.c)
|
2020-06-26 20:34:12 +03:00
|
|
|
|
|
|
|
|
|
|
|
cdef GoldNERStateC create_gold_state(
    Pool mem,
    BiluoPushDown moves,
    const StateC* stcls,
    Example example,
    neg_key
) except *:
    # Build the gold NER state for `example`.
    #
    # mem:     pool that owns the per-token transition array.
    # moves:   transition system used to map each NER tag to a Transition.
    # stcls:   current parse state (not read here).
    # example: training example providing the aligned entities and tags.
    # neg_key: key into example.y.spans holding negative-example spans, or
    #          None when there are no negative examples.
    #
    # Raises ValueError (E868) when an aligned gold entity has the same
    # character offsets and label as one of the negative spans.
    cdef GoldNERStateC gold_state
    cdef Span neg

    negs = []
    if neg_key is not None:
        negs = example.get_aligned_spans_y2x(
            example.y.spans.get(neg_key, []),
            allow_overlap=True
        )

    assert example.x.length > 0
    gold_state.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
    ner_ents, ner_tags = example.get_aligned_ents_and_ner()
    for token_index, tag in enumerate(ner_tags):
        gold_state.ner[token_index] = moves.lookup_transition(tag)

    # Prevent conflicting spans in the data. For NER, spans are equal if they
    # have the same offsets and label.
    forbidden = {
        (neg_span.start_char, neg_span.end_char, neg_span.label)
        for neg_span in negs
    }
    for gold_ent in ner_ents:
        triple = (gold_ent.start_char, gold_ent.end_char, gold_ent.label)
        if triple in forbidden:
            raise ValueError(Errors.E868.format(span=(gold_ent.start_char, gold_ent.end_char, gold_ent.label_)))

    # In order to handle negative samples, we need to maintain the full
    # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
    # thing, we'll get blocked if there's an incorrect prefix.
    for neg in negs:
        gold_state.negs.push_back(neg.c)
    return gold_state
|
|
|
|
|
|
|
|
|
2020-12-13 04:08:32 +03:00
|
|
|
cdef void update_gold_state(GoldNERStateC* gs, const StateC* state) except *:
    # Deliberately a no-op: unlike the parser, the NER gold state does not
    # need to be updated after each transition.
    return
|
|
|
|
|
|
|
|
|
2015-03-09 08:46:22 +03:00
|
|
|
# Module-level array of do_func_t entries, sized N_MOVES (one slot per move).
cdef do_func_t[N_MOVES] do_funcs
|
|
|
|
|
|
|
|
|
2020-12-13 04:08:32 +03:00
|
|
|
cdef bint _entity_is_sunk(const StateC* state, Transition* golds) nogil:
    # Report whether the currently open entity can no longer match the gold
    # annotation: it is "sunk" when the gold move at the entity's first token
    # is neither BEGIN nor UNIT, or when the gold label there differs from
    # the open entity's label. With no open entity the answer is False.
    if not state.entity_is_open():
        return False

    cdef const Transition* gold_at_start = &golds[state.E(0)]
    open_ent = state.get_ent()
    if gold_at_start.move == BEGIN or gold_at_start.move == UNIT:
        return gold_at_start.label != open_ent.label
    return True
|
|
|
|
|
2015-08-09 03:31:53 +03:00
|
|
|
|
2015-03-09 08:46:22 +03:00
|
|
|
cdef class BiluoPushDown(TransitionSystem):
    """Transition system for named entity recognition built on the BILUO
    moves (BEGIN, IN, LAST, UNIT, OUT) plus the MISSING placeholder."""

    def __init__(self, *args, **kwargs):
        TransitionSystem.__init__(self, *args, **kwargs)

    @classmethod
    def get_actions(cls, **kwargs):
        """Collect the label inventory for every move.

        Recognised keyword arguments:
            entity_types: iterable of labels; each gets a count of 1 for
                BEGIN, IN, LAST and UNIT.
            examples: iterable of examples; every token of `example.y` with a
                non-empty `ent_type_` increments that label's count for
                BEGIN, IN, LAST and UNIT.

        RETURNS (dict): move id -> Counter mapping label -> count.
        """
        actions = {
            MISSING: Counter(),
            BEGIN: Counter(),
            IN: Counter(),
            LAST: Counter(),
            UNIT: Counter(),
            OUT: Counter()
        }
        actions[OUT][''] = 1  # Represents a token predicted to be outside of any entity
        actions[UNIT][''] = 1  # Represents a token prohibited to be in an entity
        for entity_type in kwargs.get('entity_types', []):
            for action in (BEGIN, IN, LAST, UNIT):
                actions[action][entity_type] = 1
        for example in kwargs.get('examples', []):
            for token in example.y:
                ent_type = token.ent_type_
                if ent_type:
                    for action in (BEGIN, IN, LAST, UNIT):
                        actions[action][ent_type] += 1
        return actions

    @property
    def action_types(self):
        """The move ids that can be predicted (MISSING is excluded)."""
        return (BEGIN, IN, LAST, UNIT, OUT)

    def get_doc_labels(self, doc):
        """Return the set of entity-type strings present on the doc's tokens."""
        labels = set()
        for token in doc:
            if token.ent_type:
                labels.add(token.ent_type_)
        return labels

    def move_name(self, int move, attr_t label):
        """Return the tag string of a move: 'O', 'M', or '<letter>-<label>'."""
        if move == OUT:
            return 'O'
        elif move == MISSING:
            return 'M'
        else:
            return MOVE_NAMES[move] + '-' + self.strings[label]

    def init_gold_batch(self, examples):
        """Create states and gold objects for the examples that have gold
        annotation and whose initial state is not already final.

        RETURNS (tuple): (states, golds, n_steps), where n_steps is the total
            queue length over the kept states.
        """
        all_states = self.init_batch([eg.predicted for eg in examples])
        golds = []
        states = []
        for state, eg in zip(all_states, examples):
            if self.has_gold(eg) and not state.is_final():
                golds.append(self.init_gold(state, eg))
                states.append(state)
        n_steps = sum([len(s.queue) for s in states])
        return states, golds, n_steps

    cdef Transition lookup_transition(self, object name) except *:
        # Map a tag string such as 'B-PERSON', 'O' or '-' to its Transition.
        # '-', '' and None map to a MISSING transition. Raises ValueError
        # (E869) for the deprecated '!label' negation syntax and KeyError
        # (E022) when no registered transition matches.
        cdef attr_t label
        if name == '-' or name == '' or name is None:
            return Transition(clas=0, move=MISSING, label=0, score=0)
        elif '-' in name:
            move_str, label_str = split_bilu_label(name)
            # Deprecated, hacky way to denote 'not this entity'
            if label_str.startswith('!'):
                raise ValueError(Errors.E869.format(label=name))
            label = self.strings.add(label_str)
        else:
            move_str = name
            label = 0
        move = MOVE_NAMES.index(move_str)
        for i in range(self.n_moves):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
        raise KeyError(Errors.E022.format(name=name))

    cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
        # Build a Transition for (move, label) and wire in the is_valid / do /
        # get_cost callbacks of the matching move class. Raises ValueError
        # (E019) for an unknown move id.
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
        t.score = 0
        t.clas = clas
        t.move = move
        t.label = label
        if move == MISSING:
            t.is_valid = Missing.is_valid
            t.do = Missing.transition
            t.get_cost = Missing.cost
        elif move == BEGIN:
            t.is_valid = Begin.is_valid
            t.do = Begin.transition
            t.get_cost = Begin.cost
        elif move == IN:
            t.is_valid = In.is_valid
            t.do = In.transition
            t.get_cost = In.cost
        elif move == LAST:
            t.is_valid = Last.is_valid
            t.do = Last.transition
            t.get_cost = Last.cost
        elif move == UNIT:
            t.is_valid = Unit.is_valid
            t.do = Unit.transition
            t.get_cost = Unit.cost
        elif move == OUT:
            t.is_valid = Out.is_valid
            t.do = Out.transition
            t.get_cost = Out.cost
        else:
            raise ValueError(Errors.E019.format(action=move, src='ner'))
        return t

    def add_action(self, int action, label_name, freq=None):
        """Register the transition (action, label_name) if it is new.

        action (int): move id.
        label_name: label as a string, or as an integer id.
        freq: accepted for interface compatibility; its incoming value is not
            read by this method.
        RETURNS: None if the request is ignored (OUT with a non-zero label, or
            MISSING), 0 if the transition already exists, 1 if it was added.
        """
        cdef attr_t label_id
        if not isinstance(label_name, (int, long)):
            label_id = self.strings.add(label_name)
        else:
            label_id = label_name
        if action == OUT and label_id != 0:
            return None
        if action == MISSING:
            return None
        # Check we're not creating a move we already have, so that this is
        # idempotent
        for trans in self.c[:self.n_moves]:
            if trans.move == action and trans.label == label_id:
                return 0
        if self.n_moves >= self._size:
            # Grow the transition array to twice the current number of moves.
            self._size = self.n_moves
            self._size *= 2
            self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
        self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
        self.n_moves += 1
        if self.labels.get(action, []):
            # Give the new label a count below every existing one (and below 0).
            freq = min(0, min(self.labels[action].values()))
            self.labels[action][label_name] = freq-1
        else:
            self.labels[action] = Counter()
            self.labels[action][label_name] = -1
        return 1

    def set_annotations(self, StateClass state, Doc doc):
        """Write the entities recorded in `state` onto `doc`."""
        cdef int i
        ents = []
        for i in range(state.c._ents.size()):
            ent = state.c._ents.at(i)
            # Skip entities whose start or end is still unset (-1).
            if ent.start != -1 and ent.end != -1:
                ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id))
        doc.set_ents(ents, default="unmodified")
        # Set non-blocked tokens to O
        for i in range(doc.length):
            if doc.c[i].ent_iob == 0:
                doc.c[i].ent_iob = 2

    def get_beam_parses(self, Beam beam):
        """Return (probability, [(start, end, label_string), ...]) for every
        final state in the beam."""
        parses = []
        probs = beam.probs
        for i in range(beam.size):
            state = <StateC*>beam.at(i)
            if state.is_final():
                prob = probs[i]
                parse = []
                for j in range(state._ents.size()):
                    ent = state._ents.at(j)
                    if ent.start != -1 and ent.end != -1:
                        parse.append((ent.start, ent.end, self.strings[ent.label]))
                parses.append((prob, parse))
        return parses

    def init_gold(self, StateClass state, Example example):
        """Create the BiluoGold object for a state/example pair."""
        return BiluoGold(self, state, example, self.neg_key)

    def has_gold(self, Example eg, start=0, end=None):
        """Return True if the example carries NER annotation for the slice
        `eg.x[start:end]`: either a negative-example span inside the aligned
        reference range, or a reference token whose `ent_iob` is not 0."""
        # We get x and y referring to X, we want to check relative to Y,
        # the reference
        y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
        if not y_spans:
            y_spans = [eg.y[:]]
        y_span = y_spans[0]
        start = y_span.start
        end = y_span.end
        neg_key = self.neg_key
        if neg_key is not None:
            # If we have any negative samples, count that as having annotation.
            for span in eg.y.spans.get(neg_key, []):
                if span.start >= start and span.end <= end:
                    return True
        for word in eg.y[start:end]:
            if word.ent_iob != 0:
                return True
        return False

    def get_cost(self, StateClass stcls, gold, int i):
        """Return the cost of transition `i` in state `stcls`, or 9000 when
        the transition is not valid there.

        Raises TypeError (E909) if `gold` is not a BiluoGold.
        """
        if not isinstance(gold, BiluoGold):
            raise TypeError(Errors.E909.format(name="BiluoGold"))
        cdef BiluoGold gold_ = gold
        # Point at the gold object's own struct rather than copying it: a
        # by-value copy would duplicate the vector of negative spans per call.
        cdef GoldNERStateC* gold_state = &gold_.c
        if self.c[i].is_valid(stcls.c, self.c[i].label):
            cost = self.c[i].get_cost(stcls.c, gold_state, self.c[i].label)
        else:
            cost = 9000
        return cost

    cdef int set_costs(self, int* is_valid, weight_t* costs,
                       const StateC* state, gold) except -1:
        # Fill is_valid[i] and costs[i] for every transition in `state`.
        # Invalid transitions get a cost of 9000. Raises TypeError (E909) if
        # `gold` is not a BiluoGold. Returns 0 on success.
        if not isinstance(gold, BiluoGold):
            raise TypeError(Errors.E909.format(name="BiluoGold"))
        cdef BiluoGold gold_ = gold
        # Work on the gold object's own struct (no per-call copy), so that
        # update_gold_state() operates on the real gold state.
        cdef GoldNERStateC* gold_state = &gold_.c
        update_gold_state(gold_state, state)
        self.set_valid(is_valid, state)
        for i in range(self.n_moves):
            if is_valid[i]:
                costs[i] = self.c[i].get_cost(state, gold_state, self.c[i].label)
            else:
                costs[i] = 9000
        return 0
|
|
|
|
|
2015-06-02 00:05:25 +03:00
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class Missing:
    """Callbacks for the MISSING placeholder move, which is never valid."""

    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        # MISSING can never be applied.
        return False

    @staticmethod
    cdef int transition(StateC* s, attr_t label) nogil:
        # Nothing to do: the state is left untouched.
        pass

    @staticmethod
    cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
        # Same 9000 cost that the transition system assigns to invalid moves.
        return 9000
|
|
|
|
|
2015-03-11 09:25:08 +03:00
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class Begin:
    """Callbacks for the BEGIN move, which opens a multi-token entity."""

    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int b0_iob = st.B_(0).ent_iob
        cdef attr_t b0_label = st.B_(0).ent_type
        if st.entity_is_open():
            return False
        # If we're the last token of the input, we can't B -- must U or O.
        if st.buffer_length() < 2:
            return False
        if label == 0:
            return False
        # Ensure we don't clobber preset entities. If no entity preset,
        # ent_iob is 0
        if b0_iob == 1:
            return False
        if b0_iob == 3:
            # Okay, we're in a preset entity: the label has to match and the
            # next token has to be marked I (otherwise it must be U, not B).
            # When both hold, force acceptance, even if we're across a
            # sentence boundary or the token is whitespace.
            return label == b0_label and st.B_(1).ent_iob == 1
        # If the next word is B, we can't B now
        if st.B_(1).ent_iob == 3:
            return False
        # Don't allow entities to extend across sentence boundaries
        if st.B_(1).sent_start == 1:
            return False
        # Don't allow entities to start on whitespace
        return not Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE)

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Open the entity, then advance past the current buffer token.
        st.open_ent(label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
        gs = <GoldNERStateC*>_gold
        tok = s.B(0)
        cdef int penalty = 0
        cdef int g_act = gs.ner[tok].move
        cdef attr_t g_tag = gs.ner[tok].label
        cdef shared_ptr[SpanC] span

        if g_act == BEGIN:
            # B, Gold B --> Label match
            if label != g_tag:
                penalty = 1
        elif g_act != MISSING:
            # B, Gold I/L/O/U --> False (P)
            penalty = 1
        if s.buffer_length() < 3:
            # Handle negatives. In general we can't really do much to block
            # B, because we don't know whether the whole entity is going to
            # be correct or not. However, we can at least tell whether we're
            # going to be opening an entity where there's only one possible
            # L.
            for span in gs.negs:
                if span.get().label == label and span.get().start == tok:
                    penalty += 1
                    break
        return penalty
|
2015-06-05 03:27:17 +03:00
|
|
|
|
2015-06-10 00:23:28 +03:00
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class In:
    """Callbacks for the IN move, which continues the open entity."""

    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        if not st.entity_is_open():
            return False
        # If we're at the end, we can't I.
        if st.buffer_length() < 2:
            return False
        open_ent = st.get_ent()
        cdef int b0_iob = st.B_(0).ent_iob
        cdef attr_t b0_label = st.B_(0).ent_type
        if label == 0 or open_ent.label != label or b0_iob == 3:
            return False
        # If we know the next word is B, we can't be I (must be L)
        if st.B_(1).ent_iob == 3:
            return False
        if b0_iob == 1:
            # if next preset is missing or O, this can't be I (must be L)
            if st.B_(1).ent_iob == 0 or st.B_(1).ent_iob == 2:
                return False
            # With a matching label, force acceptance, even if we're across
            # a sentence boundary or the token is whitespace.
            return label == b0_label
        # Don't allow entities to extend across sentence boundaries
        return not (st.B(1) != -1 and st.B_(1).sent_start == 1)

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Advance past the current buffer token; the entity stays open.
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
        gs = <GoldNERStateC*>_gold
        # Gold move of the following token; OUT when there is none.
        cdef int next_act = OUT
        if s.B(1) >= 0:
            next_act = gs.ner[s.B(1)].move
        cdef int g_act = gs.ner[s.B(0)].move
        cdef bint is_sunk = _entity_is_sunk(s, gs.ner)
        cdef bint next_is_outside = next_act == OUT or next_act == MISSING

        if g_act == MISSING or g_act == BEGIN or g_act == IN:
            # I, Gold B --> True
            #   (P of bad open entity sunk, R of this entity sunk)
            # I, Gold I --> True
            #   (label forced by prev, if mismatch, P and R both sunk)
            return 0
        if g_act == LAST:
            # I, Gold L --> True iff this entity sunk and next tag == O
            return not (is_sunk and next_is_outside)
        if g_act == OUT:
            # I, Gold O --> True iff next tag == O
            return not next_is_outside
        if g_act == UNIT:
            # I, Gold U --> True iff next tag == O
            # (only OUT is accepted here; MISSING is not treated as O)
            return next_act != OUT
        return 1
|
2015-06-05 03:27:17 +03:00
|
|
|
|
|
|
|
cdef class Last:
    """Callbacks for the LAST move, which closes the open entity on the
    current token."""

    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int b0_iob = st.B_(0).ent_iob
        cdef attr_t b0_label = st.B_(0).ent_type
        if label == 0 or not st.entity_is_open():
            return False
        if b0_iob == 1 and st.B_(1).ent_iob != 1:
            # If a preset entity has I followed by not-I, is L.
            # Reject if the label isn't right; otherwise force acceptance,
            # even if we're across a sentence boundary or the token is
            # whitespace.
            return label == b0_label
        if st.get_ent().label != label:
            return False
        # If a preset entity has I next, we can't L here.
        return st.B_(1).ent_iob != 1

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Close the entity, then advance past the current buffer token.
        st.close_ent()
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
        gs = <GoldNERStateC*>_gold
        tok = s.B(0)
        first_tok = s.E(0)

        cdef int g_act = gs.ner[tok].move
        cdef int penalty = 0

        if g_act == IN:
            # L, Gold I --> True iff this entity sunk
            if not _entity_is_sunk(s, gs.ner):
                penalty = 1
        elif not (g_act == MISSING or g_act == BEGIN or g_act == LAST
                  or g_act == OUT or g_act == UNIT):
            # L against gold B, L, O, U (or missing gold) costs nothing;
            # only an unrecognised gold move is penalised.
            penalty = 1
        # If we have negative-example entities, integrate them into the
        # objective, by marking actions that close an entity that we know is
        # incorrect as costly.
        cdef shared_ptr[SpanC] span
        for span in gs.negs:
            if span.get().label == label and span.get().end - 1 == tok and span.get().start == first_tok:
                penalty += 1
                break
        return penalty
|
2015-06-05 03:27:17 +03:00
|
|
|
|
|
|
|
|
|
|
|
cdef class Unit:
    """Callbacks for the UNIT move, which creates a single-token entity."""

    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int b0_iob = st.B_(0).ent_iob
        cdef attr_t b0_label = st.B_(0).ent_type
        if label == 0:
            # this is only allowed if it's a preset blocked annotation
            return b0_label == 0 and b0_iob == 3
        if st.entity_is_open():
            return False
        # If next token is In, we can't be Unit -- must be Begin
        if st.B_(1).ent_iob == 1:
            return False
        if b0_iob == 3:
            # Okay, there's a preset entity here: require labels to match,
            # and then accept, ignoring the whitespace constraint.
            return label == b0_label
        return not Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE)

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Open and immediately close the entity, then advance past the token.
        st.open_ent(label)
        st.close_ent()
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
        gs = <GoldNERStateC*>_gold
        cdef int g_act = gs.ner[s.B(0)].move
        cdef attr_t g_tag = gs.ner[s.B(0)].label
        cdef int penalty = 0

        if g_act == UNIT:
            # U, Gold U --> True iff tag match
            if label != g_tag:
                penalty = 1
        elif g_act != MISSING:
            # U, Gold B/I/L/O --> False
            penalty = 1
        # If we have negative-example entities, integrate them into the
        # objective. This is fairly straight-forward for U- entities, as we
        # have a single action
        cdef int tok = s.B(0)
        cdef shared_ptr[SpanC] span
        for span in gs.negs:
            if span.get().label == label and span.get().start == tok and span.get().end == (tok + 1):
                penalty += 1
                break
        return penalty
|
2023-12-08 22:23:08 +03:00
|
|
|
|
2015-02-02 08:38:52 +03:00
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class Out:
    """Callbacks for the OUT move, which leaves the current token outside of
    any entity."""

    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int b0_iob = st.B_(0).ent_iob
        # Not valid while an entity is open, nor on a token whose preset
        # ent_iob is 3 or 1.
        return not (st.entity_is_open() or b0_iob == 3 or b0_iob == 1)

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Advance past the current buffer token.
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
        gs = <GoldNERStateC*>_gold
        cdef int g_act = gs.ner[s.B(0)].move
        # O, Gold I --> True
        # O, Gold L --> True
        # O, Gold O --> True
        # (and no cost when the gold move is missing)
        if g_act == MISSING or g_act == IN or g_act == LAST or g_act == OUT:
            return 0
        # O, Gold B --> False
        # O, Gold U --> False
        # (an unrecognised gold move is penalised as well)
        return 1
|