mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Add NER transition system
This commit is contained in:
parent
043b758cf4
commit
3d0570685c
28
spacy/syntax/ner.pxd
Normal file
28
spacy/syntax/ner.pxd
Normal file
|
@ -0,0 +1,28 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
|
||||
from ._state cimport State
|
||||
|
||||
|
||||
# A single scored action of the transition system.
#
# TransitionSystem._moves is declared as an array of these, and
# best_valid() / best_gold() return one by value.
cdef struct Transition:
    # Index of the action. NOTE(review): presumably the action's position in
    # the score array -- the visible code only ever sets `clas` on `Move`.
    int clas
    # Kind of action. NOTE(review): presumably one of the
    # BEGIN / IN / LAST / UNIT / OUT codes from ner.pyx -- confirm.
    int move
    # Integer label attached to the action.
    int label
    # Not read or written by any code visible in this commit.
    int cost
    # Model score; best_valid() fills this in on the copy it returns.
    weight_t score
|
||||
|
||||
|
||||
# C-level declaration of the NER transition system implemented in ner.pyx.
cdef class TransitionSystem:
    # Memory pool; __init__ creates it with Pool().
    cdef Pool mem
    # Number of entries in `_moves` (best_valid() iterates range(n_moves));
    # readable from Python.
    cdef readonly int n_moves
    # NOTE(review): never assigned in the visible ner.pyx -- presumably meant
    # to map label strings to integer IDs.
    cdef dict label_ids

    # Array of available transitions, indexed by class.
    cdef const Transition* _moves

    # Highest-scoring transition that is valid in state `s`.
    cdef Transition best_valid(self, const weight_t* scores,
                               const State* s) except *

    # Highest-scoring transition that is both valid and gold.
    # NOTE(review): the definition in ner.pyx takes `Move* golds` instead of
    # `gold_heads` / `gold_labels`; the two signatures have to agree.
    cdef Transition best_gold(self, Transition* guess,
                              const weight_t* scores, const State* s,
                              const int* gold_heads,
                              const int* gold_labels) except *

    # Apply a transition to `s`.
    # NOTE(review): the definition in ner.pyx takes `Move* move` rather than
    # `const Transition* t`.
    cdef int transition(self, State *s, const Transition* t) except -1
|
215
spacy/syntax/ner.pyx
Normal file
215
spacy/syntax/ner.pyx
Normal file
|
@ -0,0 +1,215 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ._state cimport State
|
||||
from ._state cimport has_head, get_idx, get_s0, get_n0
|
||||
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
|
||||
from ._state cimport head_in_buffer, children_in_buffer
|
||||
from ._state cimport head_in_stack, children_in_stack
|
||||
|
||||
from ..structs cimport TokenC
|
||||
|
||||
|
||||
# Compile-time switches. Nothing in the visible part of this module reads
# either of them.
DEF NON_MONOTONIC = True
DEF USE_BREAK = True
|
||||
|
||||
|
||||
# Move types of the entity tagger.
#
# The enum is named `ActionType` because is_valid(), is_gold() and
# best_gold() use that name as a C type; left anonymous, the name would be
# undeclared.
cdef enum ActionType:
    # No gold annotation: is_gold() accepts every move against it.
    MISSING
    # Open a new entity.
    BEGIN
    # Continue the open entity.
    IN
    # Close the open entity.
    LAST
    # Single-token entity.
    UNIT
    # Token outside any entity.
    OUT
    # Trailing sentinel; not referenced by the visible code.
    N_MOVES
|
||||
|
||||
|
||||
cdef int is_valid(ActionType act, int label, State* s) except -1:
    """Return whether move `act` with `label` may be applied in state `s`.

    BEGIN, UNIT and OUT are valid only while no entity is open.
    IN and LAST are valid only while an entity is open and its label
    (`s.curr.label`) equals `label`.

    Raises UnknownMove(act, label) for any other action code.
    """
    if act == BEGIN or act == UNIT or act == OUT:
        return not entity_is_open(s)
    if act == IN or act == LAST:
        return entity_is_open(s) and s.curr.label == label
    raise UnknownMove(act, label)
|
||||
|
||||
|
||||
cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag,
                  ActionType next_act, bint is_sunk):
    """Return whether predicting (`act`, `tag`) is acceptable given the gold move.

    act, tag: the candidate move and its label.
    g_act, g_tag: the gold move and its label.
    next_act: the gold move that follows.
    is_sunk: whether the currently open entity is already sunk.

    Decision table (B=BEGIN, I=IN, L=LAST, U=UNIT, O=OUT):

        gold MISSING  -> True for every candidate
        candidate B   -> True iff gold is B and the labels match
        candidate U   -> True iff gold is U and the labels match
        candidate I   -> gold B or I: True
                         gold L: is_sunk and next gold is O or MISSING
                         gold O: next gold is O or MISSING
                         gold U: next gold is O
        candidate L   -> gold I: is_sunk
                         gold B, L, O or U: True
        candidate O   -> gold I, L or O: True
                         gold B or U: False

    Every combination not listed above yields False.
    """
    # Without gold annotation nothing can be penalised.
    if g_act == MISSING:
        return True

    # B and U commit to a label, so they need the same gold move and label.
    if act == BEGIN or act == UNIT:
        return g_act == act and tag == g_tag

    if act == IN:
        if g_act == BEGIN or g_act == IN:
            # Gold B: precision of the bad open entity and recall of this
            # entity are already sunk. Gold I: label is forced by the
            # previous move; on mismatch P and R are both sunk.
            return True
        if g_act == LAST:
            return is_sunk and (next_act == OUT or next_act == MISSING)
        if g_act == OUT:
            return next_act == OUT or next_act == MISSING
        if g_act == UNIT:
            return next_act == OUT
        return False

    if act == LAST:
        if g_act == IN:
            # Closing early only costs nothing when the entity is sunk.
            return is_sunk
        return (g_act == BEGIN or g_act == LAST or
                g_act == OUT or g_act == UNIT)

    if act == OUT:
        return g_act == IN or g_act == LAST or g_act == OUT

    return False
|
||||
|
||||
|
||||
cdef bint entity_is_open(State *s) except -1:
    """Return True if the token before position `s.i` has `ent.tag >= 1`.

    NOTE(review): this reads `s.sent[s.i - 1]`, which is index -1 when
    `s.i == 0`; confirm that `sent` is padded so the read stays in bounds.
    """
    return s.sent[s.i - 1].ent.tag >= 1
|
||||
|
||||
|
||||
cdef bint entity_is_sunk(State *s, Move* golds) except -1:
    """Return whether the currently open entity can no longer match gold.

    False when no entity is open. Otherwise the gold move at index
    `(s.i - 1) + curr.start` is looked up, where `curr` is the `ent` record
    of the previous token; the entity counts as sunk when that gold move is
    neither BEGIN nor UNIT, or when its label differs from `s.curr.label`.

    NOTE(review): `Move` and `Entity` are not declared or cimported in the
    visible code.
    """
    cdef const Entity* curr
    cdef Move* gold

    if not entity_is_open(s):
        return False

    curr = &s.sent[s.i - 1].ent
    gold = &golds[(s.i - 1) + curr.start]
    return (gold.action != BEGIN and gold.action != UNIT) or \
        gold.label != s.curr.label
|
||||
|
||||
|
||||
cdef class TransitionSystem:
    """Transition system for entity tagging with BEGIN/IN/LAST/UNIT/OUT moves.

    NOTE(review): several names used below are not defined in the visible
    code and have to be resolved before this compiles: `Move`, `tag_names`,
    `moves`, `ACTION_NAMES`, `_is_valid`, `_is_gold`, `begin_entity`,
    `end_entity`. The module itself defines `is_valid(act, label, s)` and
    `is_gold(act, tag, g_act, g_tag, next_act, is_sunk)`, which is probably
    what the `_is_valid` / `_is_gold` calls are meant to reach.
    """

    def __init__(self, list entity_type_strs):
        """Build the move table from tag strings.

        Each tag is either '<action>-<label>', 'O', 'NULL' or 'EOL'.
        Labels get consecutive integer IDs in order of first appearance;
        '-' (ID 0) is the label of 'O', 'NULL' and 'EOL'.

        Raises ValueError for a tag in any other format.

        NOTE(review): the loop reads `tag_names` and writes `moves`, neither
        of which is defined here, while `entity_type_strs` is unused. Also
        `n_moves`, `_moves` and `label_ids` are never assigned.
        """
        cdef Move* m

        self.mem = Pool()

        label_names = {'-': 0}
        for i, tag_name in enumerate(tag_names):
            m = &moves[i]
            if '-' in tag_name:
                # Split only once so that labels containing '-' survive.
                action_str, label = tag_name.split('-', 1)
            elif tag_name == 'O':
                action_str = 'O'
                label = '-'
            elif tag_name == 'NULL' or tag_name == 'EOL':
                action_str = '?'
                label = '-'
            else:
                # ValueError subclasses StandardError on Python 2 and, unlike
                # StandardError, also exists on Python 3.
                raise ValueError(tag_name)
            m.action = ACTION_NAMES.index(action_str)
            m.label = label_names.setdefault(label, len(label_names))
            m.clas = i

    cdef int transition(self, State *s, Move* move) except -1:
        """Apply `move` to `s` and advance to the next token.

        BEGIN records the start and label of a new entity in `s.curr`.
        LAST stores `s.curr` in `s.ents`, then resets it. IN and OUT change
        nothing but the position. In every case the move's class is written
        to `s.tags[s.i]` and `s.i` is incremented.

        NOTE(review): the .pxd declares this as
        `transition(self, State *s, const Transition* t)`.
        """
        if move.action == BEGIN:
            s.curr.start = s.i
            # Was the undefined name `label`; the label comes from the move.
            s.curr.label = move.label
        elif move.action == IN:
            pass
        elif move.action == LAST:
            s.curr.end = s.i
            s.ents[s.j] = s.curr
            s.j += 1
            # Reset the scratch entity so that no entity is recorded as open.
            s.curr.start = 0
            s.curr.label = -1
            s.curr.end = 0
        elif move.action == UNIT:
            # NOTE(review): these helpers are not defined in the visible
            # code; BEGIN and LAST above do the equivalent work inline.
            begin_entity(s, move.label)
            end_entity(s)
        elif move.action == OUT:
            pass
        s.tags[s.i] = move.clas
        s.i += 1

    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
        """Return the highest-scoring transition that is valid in `s`.

        `scores[i]` is the score of `self._moves[i]`. The returned copy has
        its `score` field set to the winning score.

        Raises AssertionError if no valid transition scores above -90000.
        """
        cdef int best = -1
        cdef weight_t score = -90000
        cdef const Transition* m
        cdef int i
        cdef Transition t
        for i in range(self.n_moves):
            m = &self._moves[i]
            # `Transition` has `move` / `label` fields (no `ent_move` /
            # `ent_label`).
            if _is_valid(s, m.move, m.label) and scores[i] > score:
                best = i
                score = scores[i]
        assert best >= 0
        t = self._moves[best]
        t.score = score
        return t

    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
                              const State* s, Move* golds) except *:
        """Return the highest-scoring transition that is both valid and gold.

        Class 0 is never considered. `guess` is not used.

        Raises AssertionError if no such transition scores above -90000.

        NOTE(review): the .pxd declares `gold_heads` / `gold_labels`
        parameters instead of `golds`.
        """
        cdef int best = -1
        cdef weight_t score = -90000
        cdef const Transition* m
        cdef int i
        # NOTE(review): `g` is not passed on anywhere; presumably its action
        # and label were meant to be given to the gold check.
        cdef Move* g = &golds[s.i]
        # NOTE(review): reads golds[s.i + 1] whenever s.i < s.length; confirm
        # that `golds` has an entry past the last token.
        cdef ActionType next_act = <ActionType>golds[s.i+1].action if s.i < s.length else OUT
        cdef bint is_sunk = entity_is_sunk(s, golds)
        # Same move table and count as best_valid(): `_moves` / `n_moves` are
        # what the .pxd declares.
        for i in range(1, self.n_moves):
            m = &self._moves[i]
            if _is_valid(s, m.move, m.label) and \
               _is_gold(s, m.move, m.label, next_act, is_sunk) and \
               scores[i] > score:
                best = i
                score = scores[i]
        assert best >= 0
        return self._moves[best]
|
||||
|
||||
|
||||
class OracleError(Exception):
    """Error type for oracle failures; not raised by any code visible here."""
|
||||
|
||||
|
||||
class UnknownMove(Exception):
    """Raised by is_valid() as UnknownMove(act, label) for an unrecognised action code."""
|
Loading…
Reference in New Issue
Block a user