mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
* Tmp
This commit is contained in:
parent
220ce8bfed
commit
b3eda03c9c
|
@ -42,9 +42,17 @@ cdef struct PosTag:
|
||||||
univ_pos_t pos
|
univ_pos_t pos
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct Entity:
|
||||||
|
int start
|
||||||
|
int end
|
||||||
|
int tag
|
||||||
|
int label
|
||||||
|
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
const LexemeC* lex
|
const LexemeC* lex
|
||||||
Morphology morph
|
Morphology morph
|
||||||
|
Entity ent
|
||||||
univ_pos_t pos
|
univ_pos_t pos
|
||||||
int tag
|
int tag
|
||||||
int idx
|
int idx
|
||||||
|
|
|
@ -2,15 +2,17 @@ from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC, Entity
|
||||||
|
|
||||||
|
|
||||||
cdef struct State:
|
cdef struct State:
|
||||||
TokenC* sent
|
TokenC* sent
|
||||||
int* stack
|
int* stack
|
||||||
|
Entity* ent
|
||||||
int i
|
int i
|
||||||
int sent_len
|
int sent_len
|
||||||
int stack_len
|
int stack_len
|
||||||
|
int ents_len
|
||||||
|
|
||||||
|
|
||||||
cdef int add_dep(const State *s, const int head, const int child, const int label) except -1
|
cdef int add_dep(const State *s, const int head, const int child, const int label) except -1
|
||||||
|
|
|
@ -35,16 +35,16 @@ cdef get_cost_func_t[N_MOVES] get_cost_funcs
|
||||||
cdef class ArcEager(TransitionSystem):
|
cdef class ArcEager(TransitionSystem):
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_labels(cls, gold_parses):
|
def get_labels(cls, gold_parses):
|
||||||
labels = {SHIFT: {'ROOT': True}, REDUCE: {'ROOT': True}, RIGHT: {},
|
move_labels = {SHIFT: {'ROOT': True}, REDUCE: {'ROOT': True}, RIGHT: {},
|
||||||
LEFT: {}, BREAK: {'ROOT': True}}
|
LEFT: {}, BREAK: {'ROOT': True}}
|
||||||
for parse in gold_parses:
|
for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses:
|
||||||
for i, (head, label) in enumerate(zip(parse.heads, parse.labels)):
|
for i, (head, label) in enumerate(zip(heads, labels)):
|
||||||
if label != 'ROOT':
|
if label != 'ROOT':
|
||||||
if head > i:
|
if head > i:
|
||||||
labels[RIGHT][label] = True
|
move_labels[RIGHT][label] = True
|
||||||
elif head < i:
|
elif head < i:
|
||||||
labels[LEFT][label] = True
|
move_labels[LEFT][label] = True
|
||||||
return labels
|
return move_labels
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
|
|
|
@ -1,22 +1,33 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
from .transition_system cimport Transition
|
||||||
|
|
||||||
|
|
||||||
cdef class GoldParse:
|
cdef class GoldParse:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
|
|
||||||
|
cdef int length
|
||||||
|
cdef readonly int loss
|
||||||
|
cdef readonly object ids
|
||||||
|
cdef readonly object tags
|
||||||
|
cdef readonly object heads
|
||||||
|
cdef readonly object labels
|
||||||
|
|
||||||
|
cdef readonly object tags_
|
||||||
|
cdef readonly object labels_
|
||||||
|
cdef readonly object ner_
|
||||||
|
|
||||||
|
cdef Transition* ner
|
||||||
cdef int* c_heads
|
cdef int* c_heads
|
||||||
cdef int* c_labels
|
cdef int* c_labels
|
||||||
|
|
||||||
cdef int length
|
|
||||||
cdef int loss
|
|
||||||
|
|
||||||
cdef readonly unicode raw_text
|
|
||||||
cdef readonly list words
|
|
||||||
cdef readonly list ids
|
|
||||||
cdef readonly list tags
|
|
||||||
cdef readonly list heads
|
|
||||||
cdef readonly list labels
|
|
||||||
|
|
||||||
|
|
||||||
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
|
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
|
||||||
|
|
||||||
|
|
||||||
|
cdef class NERAnnotation:
|
||||||
|
cdef Pool mem
|
||||||
|
cdef int* starts
|
||||||
|
cdef int* ends
|
||||||
|
cdef int* labels
|
||||||
|
cdef readonly list entities
|
||||||
|
|
|
@ -1,67 +1,24 @@
|
||||||
cdef class GoldParse:
|
import numpy
|
||||||
def __init__(self, raw_text, words, ids, tags, heads, labels):
|
import codecs
|
||||||
self.mem = Pool()
|
from .ner_util import iob_to_biluo
|
||||||
self.loss = 0
|
|
||||||
self.length = len(words)
|
|
||||||
self.raw_text = raw_text
|
|
||||||
self.words = words
|
|
||||||
self.ids = ids
|
|
||||||
self.tags = tags
|
|
||||||
self.heads = heads
|
|
||||||
self.labels = labels
|
|
||||||
self.c_heads = <int*>self.mem.alloc(self.length, sizeof(int))
|
|
||||||
self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int))
|
|
||||||
|
|
||||||
@property
|
from libc.string cimport memset
|
||||||
def n_non_punct(self):
|
|
||||||
return len([l for l in self.labels if l != 'P'])
|
|
||||||
|
|
||||||
@property
|
|
||||||
def py_heads(self):
|
|
||||||
return [self.c_heads[i] for i in range(self.length)]
|
|
||||||
|
|
||||||
cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1:
|
def read_docparse_file(loc):
|
||||||
n = 0
|
sents = []
|
||||||
for i in range(self.length):
|
for sent_str in codecs.open(loc, 'r', 'utf8').read().strip().split('\n\n'):
|
||||||
if not score_punct and self.labels[i] == 'P':
|
|
||||||
continue
|
|
||||||
n += (i + tokens[i].head) == self.c_heads[i]
|
|
||||||
return n
|
|
||||||
|
|
||||||
def is_correct(self, i, head):
|
|
||||||
return head == self.c_heads[i]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_conll(cls, unicode sent_str):
|
|
||||||
ids = []
|
|
||||||
words = []
|
|
||||||
heads = []
|
|
||||||
labels = []
|
|
||||||
tags = []
|
|
||||||
for i, line in enumerate(sent_str.split('\n')):
|
|
||||||
id_, word, pos_string, head_idx, label = _parse_line(line)
|
|
||||||
words.append(word)
|
|
||||||
if head_idx == -1:
|
|
||||||
head_idx = i
|
|
||||||
ids.append(id_)
|
|
||||||
heads.append(head_idx)
|
|
||||||
labels.append(label)
|
|
||||||
tags.append(pos_string)
|
|
||||||
text = ' '.join(words)
|
|
||||||
return cls(text, [words], ids, words, tags, heads, labels)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_docparse(cls, unicode sent_str):
|
|
||||||
words = []
|
words = []
|
||||||
heads = []
|
heads = []
|
||||||
labels = []
|
labels = []
|
||||||
tags = []
|
tags = []
|
||||||
ids = []
|
ids = []
|
||||||
|
iob_ents = []
|
||||||
lines = sent_str.strip().split('\n')
|
lines = sent_str.strip().split('\n')
|
||||||
raw_text = lines.pop(0).strip()
|
raw_text = lines.pop(0).strip()
|
||||||
tok_text = lines.pop(0).strip()
|
tok_text = lines.pop(0).strip()
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
id_, word, pos_string, head_idx, label = _parse_line(line)
|
id_, word, pos_string, head_idx, label, iob_ent = _parse_line(line)
|
||||||
if label == 'root':
|
if label == 'root':
|
||||||
label = 'ROOT'
|
label = 'ROOT'
|
||||||
words.append(word)
|
words.append(word)
|
||||||
|
@ -71,57 +28,78 @@ cdef class GoldParse:
|
||||||
heads.append(head_idx)
|
heads.append(head_idx)
|
||||||
labels.append(label)
|
labels.append(label)
|
||||||
tags.append(pos_string)
|
tags.append(pos_string)
|
||||||
tokenized = [sent_str.replace('<SEP>', ' ').split(' ')
|
iob_ents.append(iob_ent)
|
||||||
for sent_str in tok_text.split('<SENT>')]
|
tokenized = [s.replace('<SEP>', ' ').split(' ')
|
||||||
return cls(raw_text, words, ids, tags, heads, labels)
|
for s in tok_text.split('<SENT>')]
|
||||||
|
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
|
||||||
|
return sents
|
||||||
|
|
||||||
def align_to_tokens(self, tokens, label_ids):
|
|
||||||
orig_words = list(self.words)
|
cdef class GoldParse:
|
||||||
annot = zip(self.ids, self.tags, self.heads, self.labels)
|
def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types):
|
||||||
self.ids = []
|
self.mem = Pool()
|
||||||
self.tags = []
|
self.loss = 0
|
||||||
self.heads = []
|
|
||||||
self.labels = []
|
|
||||||
missed = []
|
|
||||||
for token in tokens:
|
|
||||||
while annot and token.idx > annot[0][0]:
|
|
||||||
miss_id, miss_tag, miss_head, miss_label = annot.pop(0)
|
|
||||||
if not is_punct_label(miss_label):
|
|
||||||
self.loss += 1
|
|
||||||
if not annot:
|
|
||||||
self.tags.append(None)
|
|
||||||
self.heads.append(None)
|
|
||||||
self.labels.append(None)
|
|
||||||
continue
|
|
||||||
id_, tag, head, label = annot[0]
|
|
||||||
if token.idx == id_:
|
|
||||||
self.tags.append(tag)
|
|
||||||
self.heads.append(head)
|
|
||||||
self.labels.append(label)
|
|
||||||
annot.pop(0)
|
|
||||||
elif token.idx < id_:
|
|
||||||
self.tags.append(None)
|
|
||||||
self.heads.append(None)
|
|
||||||
self.labels.append(None)
|
|
||||||
else:
|
|
||||||
raise StandardError
|
|
||||||
self.length = len(tokens)
|
self.length = len(tokens)
|
||||||
self.c_heads = <int*>self.mem.alloc(self.length, sizeof(int))
|
self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||||
self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int))
|
self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||||
self.ids = [token.idx for token in tokens]
|
self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||||
self.map_heads(label_ids)
|
self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
|
||||||
return self.loss
|
|
||||||
|
|
||||||
def map_heads(self, label_ids):
|
self.ids[:] = -1
|
||||||
mapped_heads = _map_indices_to_tokens(self.ids, self.heads)
|
self.tags[:] = -1
|
||||||
for i in range(self.length):
|
self.heads[:] = -1
|
||||||
if mapped_heads[i] is None:
|
self.labels[:] = -1
|
||||||
|
|
||||||
|
self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
|
||||||
|
self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
|
self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
|
||||||
|
|
||||||
|
for i in range(len(tokens)):
|
||||||
|
self.c_heads[i] = -1
|
||||||
|
self.c_labels[i] = -1
|
||||||
|
|
||||||
|
self.tags_ = [None] * len(tokens)
|
||||||
|
self.labels_ = [None] * len(tokens)
|
||||||
|
self.ner_ = [None] * len(tokens)
|
||||||
|
|
||||||
|
idx_map = {token.idx: token.i for token in tokens}
|
||||||
|
print idx_map
|
||||||
|
# TODO: Fill NER moves
|
||||||
|
print raw_text
|
||||||
|
for idx, tag, head, label, ner in zip(*annot_tuples):
|
||||||
|
if idx < tokens[0].idx:
|
||||||
|
pass
|
||||||
|
elif idx > tokens[-1].idx:
|
||||||
|
break
|
||||||
|
elif idx in idx_map:
|
||||||
|
i = idx_map[idx]
|
||||||
|
print i, idx, head, idx_map.get(head, -1)
|
||||||
|
self.ids[i] = idx
|
||||||
|
self.tags[i] = pos_tags.index(tag)
|
||||||
|
self.heads[i] = idx_map.get(head, -1)
|
||||||
|
self.labels[i] = dep_labels[label]
|
||||||
self.c_heads[i] = -1
|
self.c_heads[i] = -1
|
||||||
self.c_labels[i] = -1
|
self.c_labels[i] = -1
|
||||||
else:
|
self.tags_[i] = tag
|
||||||
self.c_heads[i] = mapped_heads[i]
|
self.labels_[i] = label
|
||||||
self.c_labels[i] = label_ids[self.labels[i]]
|
self.ner_[i] = ner
|
||||||
return self.loss
|
|
||||||
|
@property
|
||||||
|
def n_non_punct(self):
|
||||||
|
return len([l for l in self.labels if l != 'P'])
|
||||||
|
|
||||||
|
cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1:
|
||||||
|
n = 0
|
||||||
|
for i in range(self.length):
|
||||||
|
if not score_punct and self.labels_[i] == 'P':
|
||||||
|
continue
|
||||||
|
if self.heads[i] == -1:
|
||||||
|
continue
|
||||||
|
n += (i + tokens[i].head) == self.heads[i]
|
||||||
|
return n
|
||||||
|
|
||||||
|
def is_correct(self, i, head):
|
||||||
|
return head == self.c_heads[i]
|
||||||
|
|
||||||
|
|
||||||
def is_punct_label(label):
|
def is_punct_label(label):
|
||||||
|
@ -146,6 +124,63 @@ def _parse_line(line):
|
||||||
id_ = int(pieces[0])
|
id_ = int(pieces[0])
|
||||||
word = pieces[1]
|
word = pieces[1]
|
||||||
pos = pieces[3]
|
pos = pieces[3]
|
||||||
|
iob_ent = pieces[5]
|
||||||
head_idx = int(pieces[6])
|
head_idx = int(pieces[6])
|
||||||
label = pieces[7]
|
label = pieces[7]
|
||||||
return id_, word, pos, head_idx, label
|
return id_, word, pos, head_idx, label, iob_ent
|
||||||
|
|
||||||
|
|
||||||
|
cdef class NERAnnotation:
|
||||||
|
def __init__(self, entities, length, entity_types):
|
||||||
|
self.mem = Pool()
|
||||||
|
self.starts = <int*>self.mem.alloc(length, sizeof(int))
|
||||||
|
self.ends = <int*>self.mem.alloc(length, sizeof(int))
|
||||||
|
self.labels = <int*>self.mem.alloc(length, sizeof(int))
|
||||||
|
self.entities = entities
|
||||||
|
memset(self.starts, -1, sizeof(int) * length)
|
||||||
|
memset(self.ends, -1, sizeof(int) * length)
|
||||||
|
memset(self.labels, -1, sizeof(int) * length)
|
||||||
|
|
||||||
|
cdef int start, end, label
|
||||||
|
for start, end, label in entities:
|
||||||
|
for i in range(start, end):
|
||||||
|
self.starts[i] = start
|
||||||
|
self.ends[i] = end
|
||||||
|
self.labels[i] = label
|
||||||
|
@property
|
||||||
|
def biluo_tags(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@property
|
||||||
|
def iob_tags(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_iobs(cls, iob_strs, entity_types):
|
||||||
|
return cls.from_biluos(iob_to_biluo(iob_strs), entity_types)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_biluos(cls, tag_strs, entity_types):
|
||||||
|
entities = []
|
||||||
|
start = None
|
||||||
|
for i, tag_str in enumerate(tag_strs):
|
||||||
|
if tag_str == 'O' or tag_str == '-':
|
||||||
|
continue
|
||||||
|
move, label_str = tag_str.split('-')
|
||||||
|
label = entity_types.index(label_str)
|
||||||
|
if label == -1:
|
||||||
|
label = len(entity_types)
|
||||||
|
entity_types.append(label)
|
||||||
|
if move == 'U':
|
||||||
|
assert start is None
|
||||||
|
entities.append((i, i+1, label))
|
||||||
|
elif move == 'B':
|
||||||
|
assert start is None
|
||||||
|
start = i
|
||||||
|
elif move == 'L':
|
||||||
|
assert start is not None
|
||||||
|
entities.append((start, i+1, label))
|
||||||
|
start = None
|
||||||
|
return cls(entities, len(tag_strs), entity_types)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,28 +1,7 @@
|
||||||
from cymem.cymem cimport Pool
|
from .transition_system cimport TransitionSystem
|
||||||
|
from .transition_system cimport Transition
|
||||||
from thinc.typedefs cimport weight_t
|
from ._state cimport State
|
||||||
|
|
||||||
|
|
||||||
from ._state cimport State
|
cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
pass
|
||||||
|
|
||||||
cdef struct Transition:
|
|
||||||
int clas
|
|
||||||
int move
|
|
||||||
int label
|
|
||||||
int cost
|
|
||||||
weight_t score
|
|
||||||
|
|
||||||
|
|
||||||
cdef class TransitionSystem:
|
|
||||||
cdef Pool mem
|
|
||||||
cdef readonly int n_moves
|
|
||||||
cdef dict label_ids
|
|
||||||
|
|
||||||
cdef const Transition* _moves
|
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
|
|
||||||
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
|
|
||||||
const State* s,
|
|
||||||
const int* gold_heads, const int* gold_labels) except *
|
|
||||||
cdef int transition(self, State *s, const Transition* t) except -1
|
|
||||||
|
|
|
@ -1,16 +1,15 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ._state cimport State
|
from ._state cimport State
|
||||||
from ._state cimport has_head, get_idx, get_s0, get_n0
|
|
||||||
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
|
|
||||||
from ._state cimport head_in_buffer, children_in_buffer
|
|
||||||
from ._state cimport head_in_stack, children_in_stack
|
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from .transition_system cimport Transition
|
||||||
|
from .transition_system cimport do_func_t
|
||||||
|
|
||||||
|
from ..structs cimport TokenC, Entity
|
||||||
|
|
||||||
DEF NON_MONOTONIC = True
|
from thinc.typedefs cimport weight_t
|
||||||
DEF USE_BREAK = True
|
from .conll cimport GoldParse
|
||||||
|
from .ner_util import iob_to_biluo
|
||||||
|
|
||||||
|
|
||||||
cdef enum:
|
cdef enum:
|
||||||
|
@ -23,13 +22,34 @@ cdef enum:
|
||||||
N_MOVES
|
N_MOVES
|
||||||
|
|
||||||
|
|
||||||
cdef int is_valid(ActionType act, int label, State* s) except -1:
|
cdef do_func_t[N_MOVES] do_funcs
|
||||||
|
|
||||||
|
|
||||||
|
cdef bint entity_is_open(const State *s) except -1:
|
||||||
|
return s.sent[s.i - 1].ent.tag >= 1
|
||||||
|
|
||||||
|
|
||||||
|
cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
|
||||||
|
if not entity_is_open(s):
|
||||||
|
return False
|
||||||
|
|
||||||
|
cdef const Entity* curr = &s.sent[s.i - 1].ent
|
||||||
|
cdef const Transition* gold = &golds[(s.i - 1) + curr.start]
|
||||||
|
if gold.move != BEGIN and gold.move != UNIT:
|
||||||
|
return True
|
||||||
|
elif gold.label != s.ent.label:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _is_valid(int act, int label, const State* s) except -1:
|
||||||
if act == BEGIN:
|
if act == BEGIN:
|
||||||
return not entity_is_open(s)
|
return not entity_is_open(s)
|
||||||
elif act == IN:
|
elif act == IN:
|
||||||
return entity_is_open(s) and s.curr.label == label
|
return entity_is_open(s) and s.ent.label == label
|
||||||
elif act == LAST:
|
elif act == LAST:
|
||||||
return entity_is_open(s) and s.curr.label == label
|
return entity_is_open(s) and s.ent.label == label
|
||||||
elif act == UNIT:
|
elif act == UNIT:
|
||||||
return not entity_is_open(s)
|
return not entity_is_open(s)
|
||||||
elif act == OUT:
|
elif act == OUT:
|
||||||
|
@ -38,8 +58,56 @@ cdef int is_valid(ActionType act, int label, State* s) except -1:
|
||||||
raise UnknownMove(act, label)
|
raise UnknownMove(act, label)
|
||||||
|
|
||||||
|
|
||||||
cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag,
|
cdef class BiluoPushDown(TransitionSystem):
|
||||||
ActionType next_act, bint is_sunk):
|
@classmethod
|
||||||
|
def get_labels(cls, gold_tuples):
|
||||||
|
move_labels = {BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, OUT: {'ROOT': True}}
|
||||||
|
moves = ('-', 'B', 'I', 'L', 'U')
|
||||||
|
for (raw_text, toks, (ids, tags, heads, labels, iob)) in gold_tuples:
|
||||||
|
for i, ner_tag in enumerate(iob_to_biluo(iob)):
|
||||||
|
if ner_tag != 'O' and ner_tag != '-':
|
||||||
|
move_str, label = ner_tag.split('-')
|
||||||
|
move_labels[moves.index(move_str)][label] = True
|
||||||
|
return move_labels
|
||||||
|
|
||||||
|
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||||
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
|
# constructor with the function pointers
|
||||||
|
cdef Transition t
|
||||||
|
t.score = 0
|
||||||
|
t.clas = clas
|
||||||
|
t.move = move
|
||||||
|
t.label = label
|
||||||
|
t.do = do_funcs[move]
|
||||||
|
t.get_cost = _get_cost
|
||||||
|
return t
|
||||||
|
|
||||||
|
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
||||||
|
cdef int best = -1
|
||||||
|
cdef weight_t score = -90000
|
||||||
|
cdef const Transition* m
|
||||||
|
cdef int i
|
||||||
|
for i in range(self.n_moves):
|
||||||
|
m = &self.c[i]
|
||||||
|
if _is_valid(m.move, m.label, s) and scores[i] > score:
|
||||||
|
best = i
|
||||||
|
score = scores[i]
|
||||||
|
assert best >= 0
|
||||||
|
cdef Transition t = self.c[best]
|
||||||
|
t.score = score
|
||||||
|
return t
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1:
|
||||||
|
if not _is_valid(self.move, self.label, s):
|
||||||
|
return 9000
|
||||||
|
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||||
|
cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT
|
||||||
|
return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label,
|
||||||
|
next_act, is_sunk)
|
||||||
|
|
||||||
|
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
|
||||||
|
int next_act, bint is_sunk):
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return True
|
return True
|
||||||
if act == BEGIN:
|
if act == BEGIN:
|
||||||
|
@ -112,98 +180,46 @@ cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag,
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
cdef bint entity_is_open(State *s) except -1:
|
cdef int _do_begin(const Transition* self, State* s) except -1:
|
||||||
return s.sent[s.i - 1].ent.tag >= 1
|
s.ent += 1
|
||||||
|
s.ents_len += 1
|
||||||
|
s.ent.start = s.i
|
||||||
|
s.ent.label = self.label
|
||||||
|
s.sent[s.i].ent.tag = self.clas
|
||||||
|
s.i += 1
|
||||||
|
|
||||||
|
|
||||||
cdef bint entity_is_sunk(State *s, Move* golds) except -1:
|
cdef int _do_in(const Transition* self, State* s) except -1:
|
||||||
if not entity_is_open(s):
|
s.sent[s.i].ent.tag = self.clas
|
||||||
return False
|
s.i += 1
|
||||||
|
|
||||||
cdef const Entity* curr = &s.sent[s.i - 1].ent
|
|
||||||
cdef Move* gold = &golds[(s.i - 1) + curr.start]
|
|
||||||
if gold.action != BEGIN and gold.action != UNIT:
|
|
||||||
return True
|
|
||||||
elif gold.label != s.curr.label:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
cdef class TransitionSystem:
|
cdef int _do_last(const Transition* self, State* s) except -1:
|
||||||
def __init__(self, list entity_type_strs):
|
s.ent.end = s.i+1
|
||||||
self.mem = Pool()
|
s.sent[s.i].ent.tag = self.clas
|
||||||
|
s.i += 1
|
||||||
|
|
||||||
cdef Move* m
|
|
||||||
label_names = {'-': 0}
|
|
||||||
for i, tag_name in enumerate(tag_names):
|
|
||||||
m = &moves[i]
|
|
||||||
if '-' in tag_name:
|
|
||||||
action_str, label = tag_name.split('-')
|
|
||||||
elif tag_name == 'O':
|
|
||||||
action_str = 'O'
|
|
||||||
label = '-'
|
|
||||||
elif tag_name == 'NULL' or tag_name == 'EOL':
|
|
||||||
action_str = '?'
|
|
||||||
label = '-'
|
|
||||||
else:
|
|
||||||
raise StandardError(tag_name)
|
|
||||||
m.action = ACTION_NAMES.index(action_str)
|
|
||||||
m.label = label_names.setdefault(label, len(label_names))
|
|
||||||
m.clas = i
|
|
||||||
|
|
||||||
cdef int transition(self, State *s, Move* move) except -1:
|
cdef int _do_unit(const Transition* self, State* s) except -1:
|
||||||
if move.action == BEGIN:
|
s.ent += 1
|
||||||
s.curr.start = s.i
|
s.ents_len += 1
|
||||||
s.curr.label = label
|
s.ent.start = s.i
|
||||||
elif move.action == IN:
|
s.ent.label = self.label
|
||||||
pass
|
s.ent.end = s.i+1
|
||||||
elif move.action == LAST:
|
s.sent[s.i].ent.tag = self.clas
|
||||||
s.curr.end = s.i
|
s.i += 1
|
||||||
s.ents[s.j] = s.curr
|
|
||||||
s.j += 1
|
|
||||||
s.curr.start = 0
|
|
||||||
s.curr.label = -1
|
|
||||||
s.curr.end = 0
|
|
||||||
elif move.action == UNIT:
|
|
||||||
begin_entity(s, move.label)
|
|
||||||
end_entity(s)
|
|
||||||
elif move.action == OUT:
|
|
||||||
pass
|
|
||||||
s.tags[s.i] = move.clas
|
|
||||||
s.i += 1
|
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
|
||||||
cdef int best = -1
|
|
||||||
cdef weight_t score = -90000
|
|
||||||
cdef const Transition* m
|
|
||||||
cdef int i
|
|
||||||
for i in range(self.n_moves):
|
|
||||||
m = &self._moves[i]
|
|
||||||
if _is_valid(s, m.ent_move, m.ent_label) and scores[i] > score:
|
|
||||||
best = i
|
|
||||||
score = scores[i]
|
|
||||||
assert best >= 0
|
|
||||||
cdef Transition t = self._moves[best]
|
|
||||||
t.score = score
|
|
||||||
return t
|
|
||||||
|
|
||||||
cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
|
cdef int _do_out(const Transition* self, State* s) except -1:
|
||||||
const State* s, Move* golds) except *:
|
s.sent[s.i].ent.tag = self.clas
|
||||||
cdef Move* g = &golds[s.i]
|
s.i += 1
|
||||||
cdef ActionType next_act = <ActionType>golds[s.i+1].action if s.i < s.length else OUT
|
|
||||||
cdef bint is_sunk = entity_is_sunk(s, golds)
|
|
||||||
cdef Move* m
|
do_funcs[BEGIN] = _do_begin
|
||||||
cdef int n_accept = 0
|
do_funcs[IN] = _do_in
|
||||||
for i in range(1, self.n_classes):
|
do_funcs[LAST] = _do_last
|
||||||
m = &moves[i]
|
do_funcs[UNIT] = _do_unit
|
||||||
if _is_valid(s, m.move, m.label) and \
|
do_funcs[OUT] = _do_out
|
||||||
_is_gold(s, m.move, m.label, next_act, is_sunk) and \
|
|
||||||
scores[i] > score:
|
|
||||||
best = i
|
|
||||||
score = scores[i]
|
|
||||||
assert best >= 0
|
|
||||||
return self._moves[best]
|
|
||||||
|
|
||||||
|
|
||||||
class OracleError(Exception):
|
class OracleError(Exception):
|
||||||
|
@ -212,3 +228,5 @@ class OracleError(Exception):
|
||||||
|
|
||||||
class UnknownMove(Exception):
|
class UnknownMove(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -35,3 +35,10 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
cdef Transition best_gold(self, const weight_t* scores, const State* state,
|
cdef Transition best_gold(self, const weight_t* scores, const State* state,
|
||||||
GoldParse gold) except *
|
GoldParse gold) except *
|
||||||
|
|
||||||
|
|
||||||
|
#cdef class PyState:
|
||||||
|
# """Provide a Python class for testing purposes."""
|
||||||
|
# cdef Pool mem
|
||||||
|
# cdef TransitionSystem system
|
||||||
|
# cdef State* _state
|
||||||
|
|
|
@ -45,3 +45,39 @@ cdef class TransitionSystem:
|
||||||
score = scores[i]
|
score = scores[i]
|
||||||
assert score > MIN_SCORE
|
assert score > MIN_SCORE
|
||||||
return best
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
#cdef class PyState:
|
||||||
|
# """Provide a Python class for testing purposes."""
|
||||||
|
# def __init__(self, GoldParse gold):
|
||||||
|
# self.mem = Pool()
|
||||||
|
# self.system = EntityRecognition(labels)
|
||||||
|
# self._state = init_state(self.mem, tokens, gold.length)
|
||||||
|
#
|
||||||
|
# def transition(self, name):
|
||||||
|
# cdef const Transition* trans = self._transition_by_name(name)
|
||||||
|
# trans.do(trans, self._state)
|
||||||
|
#
|
||||||
|
# def is_valid(self, name):
|
||||||
|
# cdef const Transition* trans = self._transition_by_name(name)
|
||||||
|
# return _is_valid(trans.move, trans.label, self._state)
|
||||||
|
#
|
||||||
|
# def is_gold(self, name):
|
||||||
|
# cdef const Transition* trans = self._transition_by_name(name)
|
||||||
|
# return _get_const(trans, self._state, self._gold)
|
||||||
|
#
|
||||||
|
# property ent:
|
||||||
|
# def __get__(self):
|
||||||
|
# pass
|
||||||
|
#
|
||||||
|
# property n_ents:
|
||||||
|
# def __get__(self):
|
||||||
|
# pass
|
||||||
|
#
|
||||||
|
# property i:
|
||||||
|
# def __get__(self):
|
||||||
|
# pass
|
||||||
|
#
|
||||||
|
# property open_entity:
|
||||||
|
# def __get__(self):
|
||||||
|
# return entity_is_open(self._s)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user