2017-04-15 14:05:15 +03:00
|
|
|
# coding: utf-8
|
2015-02-02 08:38:52 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2017-04-15 14:05:15 +03:00
|
|
|
from thinc.typedefs cimport weight_t
|
2017-05-27 23:50:21 +03:00
|
|
|
from collections import OrderedDict
|
2017-04-15 14:05:15 +03:00
|
|
|
|
|
|
|
from .stateclass cimport StateClass
|
|
|
|
from ._state cimport StateC
|
2015-03-09 08:46:22 +03:00
|
|
|
from .transition_system cimport Transition
|
|
|
|
from .transition_system cimport do_func_t
|
|
|
|
from ..structs cimport TokenC, Entity
|
2015-06-02 21:01:06 +03:00
|
|
|
from ..gold cimport GoldParseC
|
2015-05-24 22:35:02 +03:00
|
|
|
from ..gold cimport GoldParse
|
2015-07-19 16:18:17 +03:00
|
|
|
from ..attrs cimport ENT_TYPE, ENT_IOB
|
2015-02-02 08:38:52 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Move types of the BILUO transition system. The enum is anonymous, so its
# members are plain C integer constants numbered from 0 in declaration order;
# N_MOVES, declared last, therefore equals the number of move types.
cdef enum:
    MISSING
    BEGIN
    IN
    LAST
    UNIT
    OUT
    ISNT
    N_MOVES


# Short string name of every move, indexed by the move's enum value.
# lookup_transition() maps a name back to its move with MOVE_NAMES.index().
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[MISSING] = 'M'
MOVE_NAMES[BEGIN] = 'B'
MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
# 'x' marks the "is not this entity" pseudo-move (see lookup_transition).
MOVE_NAMES[ISNT] = 'x'


# C array with one do_func_t slot per move type. Nothing in this part of the
# file reads or writes it.
cdef do_func_t[N_MOVES] do_funcs
|
|
|
|
|
|
|
|
|
2015-06-10 08:09:17 +03:00
|
|
|
cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
    # Report whether the currently open entity can no longer match the gold
    # annotation. With no open entity the answer is always False.
    if not st.entity_is_open():
        return False

    # Gold transition of the token at which the open entity starts.
    cdef const Transition* start_gold = &golds[st.E(0)]
    cdef bint gold_starts_here = (start_gold.move == BEGIN
                                  or start_gold.move == UNIT)
    # Sunk when gold opens no entity at that token, or opens one whose label
    # differs from the label of the entity we opened.
    return (not gold_starts_here) or start_gold.label != st.E_(0).ent_type
|
|
|
|
|
2015-08-09 03:31:53 +03:00
|
|
|
|
2015-03-09 08:46:22 +03:00
|
|
|
cdef class BiluoPushDown(TransitionSystem):
    """Transition system that labels entities with BILUO moves.

    Every transition pairs a move type (MISSING, BEGIN, IN, LAST, UNIT or
    OUT) with an entity label; init_transition() binds each move type to the
    handler class of the same name defined later in this module.
    """
    def __init__(self, *args, **kwargs):
        TransitionSystem.__init__(self, *args, **kwargs)

    def __reduce__(self):
        """Pickle support.

        Returns the constructor together with the string store and an
        OrderedDict mapping each move type to the label strings registered
        for it, in the order the transitions are stored.
        """
        labels_by_action = OrderedDict()
        for trans in self.c[:self.n_moves]:
            label_str = self.strings[trans.label]
            labels_by_action.setdefault(trans.move, []).append(label_str)
        return (BiluoPushDown, (self.strings, labels_by_action),
                None, None)

    @classmethod
    def get_actions(cls, **kwargs):
        """Build the mapping from move type to the labels it may carry.

        Keyword arguments read:
            actions: existing mapping to extend in place. When absent, a
                fresh OrderedDict is created in which MISSING and OUT carry
                the empty label and the other moves carry none.
            entity_types: iterable of labels to register for B/I/L/U.
            gold_parses: iterable of (raw_text, sents) pairs, where each
                item of sents is ((ids, words, tags, heads, labels, biluo), _).
                Every label found in the biluo tags is registered as well.

        Returns the mapping.

        Raises ValueError (carrying the tag) for a tag other than 'O' or '-'
        that does not contain exactly one '-'.
        """
        actions = kwargs.get('actions',
                    OrderedDict((
                        (MISSING, ['']),
                        (BEGIN, []),
                        (IN, []),
                        (LAST, []),
                        (UNIT, []),
                        (OUT, [''])
                    )))
        # Labels registered during this call, so each is appended only once.
        seen_entities = set()
        for entity_type in kwargs.get('entity_types', []):
            if entity_type in seen_entities:
                continue
            seen_entities.add(entity_type)
            for action in (BEGIN, IN, LAST, UNIT):
                actions[action].append(entity_type)
        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, biluo), _ in sents:
                for ner_tag in biluo:
                    if ner_tag == 'O' or ner_tag == '-':
                        continue
                    if ner_tag.count('-') != 1:
                        raise ValueError(ner_tag)
                    _, label = ner_tag.split('-')
                    if label not in seen_entities:
                        seen_entities.add(label)
                        # Same four moves, in the same order, as for
                        # entity_types above.
                        for action in (BEGIN, IN, LAST, UNIT):
                            actions[action].append(label)
        return actions

    property action_types:
        # The move types listed here; MISSING and ISNT are not included.
        def __get__(self):
            return (BEGIN, IN, LAST, UNIT, OUT)

    def move_name(self, int move, attr_t label):
        """Return the string form of a move/label pair.

        OUT gives 'O' and MISSING gives 'M'; any other move gives
        '<move letter>-<label string>'.
        """
        if move == OUT:
            return 'O'
        elif move == MISSING:
            return 'M'
        else:
            return MOVE_NAMES[move] + '-' + self.strings[label]

    def has_gold(self, GoldParse gold, start=0, end=None):
        """Return True if gold.ner[start:end] holds any tag other than '-'.

        end=None means "up to the end of gold.ner". An explicit end of 0 is
        honoured as an empty range (for which the result is False) instead of
        being silently replaced by the full length.
        """
        if end is None:
            end = len(gold.ner)
        return not all(tag == '-' for tag in gold.ner[start:end])

    def preprocess_gold(self, GoldParse gold):
        """Resolve the gold NER tag strings into transitions.

        Returns None when has_gold() reports no annotation. Otherwise fills
        gold.c.ner[i] with the transition for gold.ner[i] for every i below
        gold.length and returns gold.
        """
        if not self.has_gold(gold):
            return None
        for i in range(gold.length):
            gold.c.ner[i] = self.lookup_transition(gold.ner[i])
        return gold

    cdef Transition lookup_transition(self, object name) except *:
        # Translate a tag string into a Transition.
        #   None or '-'    -> MISSING, label 0
        #   'X-LABEL'      -> move X with LABEL added to the string store
        #   'X-!LABEL'     -> ISNT ("not this entity") for LABEL
        #   anything else  -> the name is the move itself, label 0
        # Raises KeyError(name) when no registered transition matches; an
        # unknown move name makes MOVE_NAMES.index() raise ValueError.
        cdef attr_t label
        if name is None or name == '-':
            move_str = 'M'
            label = 0
        elif '-' in name:
            move_str, label_str = name.split('-', 1)
            # Hacky way to denote 'not this entity'
            if label_str.startswith('!'):
                label_str = label_str[1:]
                move_str = 'x'
            label = self.strings.add(label_str)
        else:
            move_str = name
            label = 0
        move = MOVE_NAMES.index(move_str)
        if move == ISNT:
            # ISNT is built on the fly here rather than looked up among the
            # registered transitions.
            return Transition(clas=0, move=ISNT, label=label, score=0)
        for i in range(self.n_moves):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
        raise KeyError(name)

    cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
        # Build the Transition for (clas, move, label) and attach the
        # is_valid / do / get_cost handlers of the matching move class.
        # Any move without a handler class (ISNT included) raises
        # Exception(move).
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
        t.score = 0
        t.clas = clas
        t.move = move
        t.label = label
        if move == MISSING:
            t.is_valid = Missing.is_valid
            t.do = Missing.transition
            t.get_cost = Missing.cost
        elif move == BEGIN:
            t.is_valid = Begin.is_valid
            t.do = Begin.transition
            t.get_cost = Begin.cost
        elif move == IN:
            t.is_valid = In.is_valid
            t.do = In.transition
            t.get_cost = In.cost
        elif move == LAST:
            t.is_valid = Last.is_valid
            t.do = Last.transition
            t.get_cost = Last.cost
        elif move == UNIT:
            t.is_valid = Unit.is_valid
            t.do = Unit.transition
            t.get_cost = Unit.cost
        elif move == OUT:
            t.is_valid = Out.is_valid
            t.do = Out.transition
            t.get_cost = Out.cost
        else:
            raise Exception(move)
        return t

    cdef int initialize_state(self, StateC* st) nogil:
        # For every token that already has a non-zero ent_type, call
        # add_action() for BEGIN, IN, UNIT and LAST with that type. The GIL
        # is re-acquired for those calls.
        # This is especially necessary when we use limited training data.
        for i in range(st.length):
            if st._sent[i].ent_type != 0:
                with gil:
                    self.add_action(BEGIN, st._sent[i].ent_type)
                    self.add_action(IN, st._sent[i].ent_type)
                    self.add_action(UNIT, st._sent[i].ent_type)
                    self.add_action(LAST, st._sent[i].ent_type)
|
|
|
|
|
2015-06-02 00:05:25 +03:00
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class Missing:
    # Handlers that init_transition() attaches to MISSING transitions.
    # The move can never be taken: it is never valid, applying it does
    # nothing, and its cost is a constant 9000.
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        return False

    @staticmethod
    cdef int transition(StateC* s, attr_t label) nogil:
        # Deliberately leaves the state untouched.
        pass

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return 9000
|
|
|
|
|
2015-03-11 09:25:08 +03:00
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class Begin:
    # Handlers that init_transition() attaches to BEGIN ('B') transitions.
    # ent_iob codes as written by the transitions in this module:
    # 1 = inside (IN/LAST), 2 = outside (OUT), 3 = begin (BEGIN/UNIT).
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        # Ensure we don't clobber preset entities. If no entity preset,
        # ent_iob is 0
        cdef int current_iob = st.B_(0).ent_iob
        if current_iob == 1 or current_iob == 2:
            return False
        if current_iob == 3 and st.B_(0).ent_type != label:
            return False
        # If the next word is B or O, we can't B now
        if st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3:
            return False
        # If the current word is B, and the next word isn't I, the current
        # word is really U
        if current_iob == 3 and st.B_(1).ent_iob != 1:
            return False
        # Don't allow entities to extend across sentence boundaries
        if st.B_(1).sent_start:
            return False
        # A real label is required and no entity may already be open.
        return label != 0 and not st.entity_is_open()

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Open an entity, tag the first buffer token as "begin", then
        # push and pop.
        st.open_ent(label)
        st.set_ent_tag(st.B(0), 3, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int gold_move = gold.ner[s.B(0)].move
        cdef attr_t gold_label = gold.ner[s.B(0)].label

        # No gold annotation for this token: nothing to get wrong.
        if gold_move == MISSING:
            return 0
        # B, Gold B --> Label match
        if gold_move == BEGIN:
            return label != gold_label
        # Support partial supervision in the form of "not this label"
        if gold_move == ISNT:
            return label == gold_label
        # B, Gold I --> False (P)
        # B, Gold L --> False (P)
        # B, Gold O --> False (P)
        # B, Gold U --> False (P)
        return 1
|
|
|
|
|
2015-06-10 00:23:28 +03:00
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class In:
    # Handlers that init_transition() attaches to IN ('I') transitions.
    # ent_iob codes as written by the transitions in this module:
    # 1 = inside (IN/LAST), 2 = outside (OUT), 3 = begin (BEGIN/UNIT).
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int preset_ent_iob = st.B_(0).ent_iob
        # A token preset as O or B cannot continue an entity.
        if preset_ent_iob == 2 or preset_ent_iob == 3:
            return False
        # TODO: Is this quite right?
        # I think it's supposed to be ensuring the gazetteer matches are
        # maintained
        if st.B_(1).ent_iob != preset_ent_iob:
            return False
        # Don't allow entities to extend across sentence boundaries
        if st.B_(1).sent_start:
            return False
        # Only valid inside an open entity that carries this same label.
        return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Tag the first buffer token as "inside", then push and pop.
        st.set_ent_tag(st.B(0), 1, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int next_i = s.B(1)
        cdef int g_act = gold.ner[s.B(0)].move
        cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
        # Gold move of the following token, or OUT when there is none.
        # The original guard only tested B(0), which does not protect the
        # read at B(1); a negative index would read before the start of
        # gold.ner, so it is rejected explicitly.
        # NOTE(review): presumably B(1) returns a negative index when the
        # buffer has no second token -- confirm against StateClass.B.
        cdef int next_act = OUT
        if s.B(0) < s.c.length and next_i >= 0:
            next_act = gold.ner[next_i].move

        if g_act == MISSING:
            return 0
        elif g_act == BEGIN:
            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
            return 0
        elif g_act == IN:
            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
            return 0
        elif g_act == LAST:
            # I, Gold L --> True iff this entity sunk and next tag == O
            return not (is_sunk and (next_act == OUT or next_act == MISSING))
        elif g_act == OUT:
            # I, Gold O --> True iff next tag == O
            return not (next_act == OUT or next_act == MISSING)
        elif g_act == UNIT:
            # I, Gold U --> True iff next tag == O
            return next_act != OUT
        else:
            return 1
|
2015-06-05 03:27:17 +03:00
|
|
|
|
|
|
|
|
|
|
|
cdef class Last:
    # Handlers that init_transition() attaches to LAST ('L') transitions.
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        # The entity cannot end here if the next token is preset as
        # "inside" (ent_iob == 1).
        if st.B_(1).ent_iob == 1:
            return False
        # Only valid inside an open entity that carries this same label.
        return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Close the open entity, tag the first buffer token as "inside",
        # then push and pop.
        st.close_ent()
        st.set_ent_tag(st.B(0), 1, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        # The unused locals `move` and `g_tag` of the earlier version were
        # dropped; only the gold move of the current token is consulted.
        cdef int g_act = gold.ner[s.B(0)].move

        if g_act == MISSING:
            return 0
        elif g_act == BEGIN:
            # L, Gold B --> True
            return 0
        elif g_act == IN:
            # L, Gold I --> True iff this entity sunk
            return not _entity_is_sunk(s, gold.ner)
        elif g_act == LAST:
            # L, Gold L --> True
            return 0
        elif g_act == OUT:
            # L, Gold O --> True
            return 0
        elif g_act == UNIT:
            # L, Gold U --> True
            return 0
        else:
            return 1
|
2015-06-05 03:27:17 +03:00
|
|
|
|
|
|
|
|
|
|
|
cdef class Unit:
    # Handlers that init_transition() attaches to UNIT ('U') transitions.
    # ent_iob codes as written by the transitions in this module:
    # 1 = inside (IN/LAST), 2 = outside (OUT), 3 = begin (BEGIN/UNIT).
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int current_iob = st.B_(0).ent_iob
        # A token preset as O or I cannot be a single-token entity.
        if current_iob == 2 or current_iob == 1:
            return False
        # A preset B must keep its preset label.
        if current_iob == 3 and st.B_(0).ent_type != label:
            return False
        # The next token is preset as "inside", so the entity cannot end here.
        if st.B_(1).ent_iob == 1:
            return False
        # A real label is required and no entity may already be open.
        return label != 0 and not st.entity_is_open()

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Open and immediately close an entity, tag the first buffer token
        # as "begin", then push and pop.
        st.open_ent(label)
        st.close_ent()
        st.set_ent_tag(st.B(0), 3, label)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int gold_move = gold.ner[s.B(0)].move
        cdef attr_t gold_label = gold.ner[s.B(0)].label

        # No gold annotation for this token: nothing to get wrong.
        if gold_move == MISSING:
            return 0
        # U, Gold U --> True iff tag match
        if gold_move == UNIT:
            return label != gold_label
        # Support partial supervision in the form of "not this label"
        if gold_move == ISNT:
            return label == gold_label
        # U, Gold B --> False
        # U, Gold I --> False
        # U, Gold L --> False
        # U, Gold O --> False
        return 1
|
2015-02-02 08:38:52 +03:00
|
|
|
|
|
|
|
|
2015-06-05 03:27:17 +03:00
|
|
|
cdef class Out:
    # Handlers that init_transition() attaches to OUT ('O') transitions.
    # ent_iob codes as written by the transitions in this module:
    # 1 = inside (IN/LAST), 2 = outside (OUT), 3 = begin (BEGIN/UNIT).
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int preset_ent_iob = st.B_(0).ent_iob
        # A token preset as B or I belongs to an entity and cannot be O.
        if preset_ent_iob == 3 or preset_ent_iob == 1:
            return False
        return not st.entity_is_open()

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
        # Tag the first buffer token as "outside" with label 0, then push
        # and pop. The label argument is ignored.
        st.set_ent_tag(st.B(0), 2, 0)
        st.push()
        st.pop()

    @staticmethod
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        # The unused local `g_tag` of the earlier version was dropped; the
        # cost depends only on the gold move of the current token.
        cdef int g_act = gold.ner[s.B(0)].move

        if g_act == MISSING or g_act == ISNT:
            return 0
        elif g_act == IN or g_act == LAST or g_act == OUT:
            # O, Gold I --> True
            # O, Gold L --> True
            # O, Gold O --> True
            return 0
        else:
            # O, Gold B --> False
            # O, Gold U --> False
            # Any other gold move also costs 1.
            return 1
|
2017-03-11 20:12:01 +03:00
|
|
|
|
2015-02-02 08:38:52 +03:00
|
|
|
|
|
|
|
class OracleError(Exception):
    """Plain Exception subclass with no added behaviour.

    Presumably signals an oracle failure; nothing in this part of the file
    raises it.
    """
|
|
|
|
|
|
|
|
|
|
|
|
class UnknownMove(Exception):
    """Plain Exception subclass with no added behaviour.

    Presumably signals an unrecognised move; nothing in this part of the
    file raises it.
    """
|