mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only uses the current word and 2 words of context. Needs refactoring.
This commit is contained in:
parent
e99f19dd6c
commit
8057a95f20
|
@ -24,6 +24,8 @@ from spacy.syntax.util import Config
|
|||
from spacy.syntax.conll import read_docparse_file
|
||||
from spacy.syntax.conll import GoldParse
|
||||
|
||||
from spacy.scorer import Scorer
|
||||
|
||||
|
||||
def is_punct_label(label):
|
||||
return label == 'P' or label.lower() == 'punct'
|
||||
|
@ -186,7 +188,6 @@ def get_labels(sents):
|
|||
|
||||
def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||
gold_preproc=False, force_gold=False, n_sents=0):
|
||||
print "Setup model dir"
|
||||
dep_model_dir = path.join(model_dir, 'deps')
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
ner_model_dir = path.join(model_dir, 'ner')
|
||||
|
@ -209,13 +210,16 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
|||
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
|
||||
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
|
||||
|
||||
if n_sents > 0:
|
||||
gold_tuples = gold_tuples[:n_sents]
|
||||
nlp = Language()
|
||||
|
||||
ent_strings = [None] * (max(nlp.entity.moves.label_ids.values()) + 1)
|
||||
for label, i in nlp.entity.moves.label_ids.items():
|
||||
ent_strings[i] = label
|
||||
|
||||
print "Itn.\tUAS\tNER F.\tTag %"
|
||||
for itn in range(n_iter):
|
||||
dep_corr = 0
|
||||
pos_corr = 0
|
||||
ent_corr = 0
|
||||
n_tokens = 0
|
||||
scorer = Scorer()
|
||||
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
||||
if gold_preproc:
|
||||
sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
|
||||
|
@ -224,51 +228,32 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
|||
for tokens in sents:
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
nlp.tagger(tokens)
|
||||
#ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold)
|
||||
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
|
||||
pos_corr += nlp.tagger.train(tokens, gold.tags)
|
||||
n_tokens += len(tokens)
|
||||
acc = float(dep_corr) / n_tokens
|
||||
pos_acc = float(pos_corr) / n_tokens
|
||||
print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc
|
||||
nlp.entity.train(tokens, gold, force_gold=force_gold)
|
||||
#nlp.parser.train(tokens, gold, force_gold=force_gold)
|
||||
nlp.tagger.train(tokens, gold.tags)
|
||||
|
||||
nlp.entity(tokens)
|
||||
tokens._ent_strings = tuple(ent_strings)
|
||||
nlp.parser(tokens)
|
||||
scorer.score(tokens, gold, verbose=False)
|
||||
print '%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)
|
||||
random.shuffle(gold_tuples)
|
||||
nlp.parser.model.end_training()
|
||||
nlp.entity.model.end_training()
|
||||
nlp.tagger.model.end_training()
|
||||
return acc
|
||||
|
||||
|
||||
def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
||||
global loss
|
||||
assert not gold_preproc
|
||||
nlp = Language()
|
||||
uas_corr = 0
|
||||
las_corr = 0
|
||||
pos_corr = 0
|
||||
n_tokens = 0
|
||||
total = 0
|
||||
skipped = 0
|
||||
loss = 0
|
||||
gold_tuples = read_docparse_file(train_loc)
|
||||
gold_tuples = read_docparse_file(dev_loc)
|
||||
scorer = Scorer()
|
||||
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
||||
if gold_preproc:
|
||||
tokens = nlp.tokenizer.tokens_from_list(gold_sent.words)
|
||||
nlp.tagger(tokens)
|
||||
nlp.parser(tokens)
|
||||
gold_sent.map_heads(nlp.parser.moves.label_ids)
|
||||
else:
|
||||
tokens = nlp(gold_sent.raw_text)
|
||||
loss += gold_sent.align_to_tokens(tokens, nlp.parser.moves.label_ids)
|
||||
for i, token in enumerate(tokens):
|
||||
pos_corr += token.tag_ == gold_sent.tags[i]
|
||||
n_tokens += 1
|
||||
if gold_sent.heads[i] is None:
|
||||
skipped += 1
|
||||
continue
|
||||
if gold_sent.labels[i] != 'P':
|
||||
n_corr += gold_sent.is_correct(i, token.head.i)
|
||||
total += 1
|
||||
print loss, skipped, (loss+skipped + total)
|
||||
print pos_corr / n_tokens
|
||||
return float(n_corr) / (total + loss)
|
||||
tokens = nlp(raw_text)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=False)
|
||||
return scorer
|
||||
|
||||
|
||||
|
||||
|
@ -281,7 +266,14 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
|||
def main(train_loc, dev_loc, model_dir, n_sents=0):
|
||||
train(English, train_loc, model_dir,
|
||||
gold_preproc=False, force_gold=False, n_sents=n_sents)
|
||||
print evaluate(English, dev_loc, model_dir, gold_preproc=False)
|
||||
scorer = evaluate(English, dev_loc, model_dir, gold_preproc=False)
|
||||
print 'POS', scorer.tags_acc
|
||||
print 'UAS', scorer.uas
|
||||
print 'LAS', scorer.las
|
||||
|
||||
print 'NER P', scorer.ents_p
|
||||
print 'NER R', scorer.ents_r
|
||||
print 'NER F', scorer.ents_f
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -180,7 +180,12 @@ class English(object):
|
|||
if parse and self.has_parser_model:
|
||||
self.parser(tokens)
|
||||
if entity and self.has_entity_model:
|
||||
# TODO: Clean this up
|
||||
self.entity(tokens)
|
||||
ent_strings = [None] * (max(self.entity.moves.label_ids.values()) + 1)
|
||||
for label, i in self.entity.moves.label_ids.items():
|
||||
ent_strings[i] = label
|
||||
tokens._ent_strings = tuple(ent_strings)
|
||||
return tokens
|
||||
|
||||
@property
|
||||
|
|
|
@ -45,14 +45,12 @@ cdef struct PosTag:
|
|||
cdef struct Entity:
|
||||
int start
|
||||
int end
|
||||
int tag
|
||||
int label
|
||||
|
||||
|
||||
|
||||
cdef struct TokenC:
|
||||
const LexemeC* lex
|
||||
Morphology morph
|
||||
Entity ent
|
||||
univ_pos_t pos
|
||||
int tag
|
||||
int idx
|
||||
|
@ -64,6 +62,9 @@ cdef struct TokenC:
|
|||
uint32_t l_kids
|
||||
uint32_t r_kids
|
||||
|
||||
int ent_iob
|
||||
int ent_type
|
||||
|
||||
|
||||
cdef struct Utf8Str:
|
||||
id_t i
|
||||
|
|
|
@ -16,6 +16,7 @@ cdef int fill_context(atom_t* context, State* state) except -1
|
|||
# S0w,
|
||||
# S0r0w, S0r2w, S0rw,
|
||||
# N0l0w, N0l2w, N0lw,
|
||||
# P2w, P1w,
|
||||
# N0w, N1w, N2w, N3w, 0
|
||||
#]
|
||||
|
||||
|
@ -28,6 +29,9 @@ cpdef enum:
|
|||
S2c4
|
||||
S2c6
|
||||
S2L
|
||||
S2_prefix
|
||||
S2_suffix
|
||||
S2_shape
|
||||
|
||||
S1w
|
||||
S1W
|
||||
|
@ -36,6 +40,9 @@ cpdef enum:
|
|||
S1c4
|
||||
S1c6
|
||||
S1L
|
||||
S1_prefix
|
||||
S1_suffix
|
||||
S1_shape
|
||||
|
||||
S1rw
|
||||
S1rW
|
||||
|
@ -44,6 +51,9 @@ cpdef enum:
|
|||
S1rc4
|
||||
S1rc6
|
||||
S1rL
|
||||
S1r_prefix
|
||||
S1r_suffix
|
||||
S1r_shape
|
||||
|
||||
S0lw
|
||||
S0lW
|
||||
|
@ -52,6 +62,9 @@ cpdef enum:
|
|||
S0lc4
|
||||
S0lc6
|
||||
S0lL
|
||||
S0l_prefix
|
||||
S0l_suffix
|
||||
S0l_shape
|
||||
|
||||
S0l2w
|
||||
S0l2W
|
||||
|
@ -60,6 +73,9 @@ cpdef enum:
|
|||
S0l2c4
|
||||
S0l2c6
|
||||
S0l2L
|
||||
S0l2_prefix
|
||||
S0l2_suffix
|
||||
S0l2_shape
|
||||
|
||||
S0w
|
||||
S0W
|
||||
|
@ -68,6 +84,9 @@ cpdef enum:
|
|||
S0c4
|
||||
S0c6
|
||||
S0L
|
||||
S0_prefix
|
||||
S0_suffix
|
||||
S0_shape
|
||||
|
||||
S0r2w
|
||||
S0r2W
|
||||
|
@ -76,6 +95,9 @@ cpdef enum:
|
|||
S0r2c4
|
||||
S0r2c6
|
||||
S0r2L
|
||||
S0r2_prefix
|
||||
S0r2_suffix
|
||||
S0r2_shape
|
||||
|
||||
S0rw
|
||||
S0rW
|
||||
|
@ -84,6 +106,9 @@ cpdef enum:
|
|||
S0rc4
|
||||
S0rc6
|
||||
S0rL
|
||||
S0r_prefix
|
||||
S0r_suffix
|
||||
S0r_shape
|
||||
|
||||
N0l2w
|
||||
N0l2W
|
||||
|
@ -92,6 +117,9 @@ cpdef enum:
|
|||
N0l2c4
|
||||
N0l2c6
|
||||
N0l2L
|
||||
N0l2_prefix
|
||||
N0l2_suffix
|
||||
N0l2_shape
|
||||
|
||||
N0lw
|
||||
N0lW
|
||||
|
@ -100,6 +128,9 @@ cpdef enum:
|
|||
N0lc4
|
||||
N0lc6
|
||||
N0lL
|
||||
N0l_prefix
|
||||
N0l_suffix
|
||||
N0l_shape
|
||||
|
||||
N0w
|
||||
N0W
|
||||
|
@ -108,6 +139,9 @@ cpdef enum:
|
|||
N0c4
|
||||
N0c6
|
||||
N0L
|
||||
N0_prefix
|
||||
N0_suffix
|
||||
N0_shape
|
||||
|
||||
N1w
|
||||
N1W
|
||||
|
@ -116,7 +150,10 @@ cpdef enum:
|
|||
N1c4
|
||||
N1c6
|
||||
N1L
|
||||
|
||||
N1_prefix
|
||||
N1_suffix
|
||||
N1_shape
|
||||
|
||||
N2w
|
||||
N2W
|
||||
N2p
|
||||
|
@ -124,7 +161,32 @@ cpdef enum:
|
|||
N2c4
|
||||
N2c6
|
||||
N2L
|
||||
N2_prefix
|
||||
N2_suffix
|
||||
N2_shape
|
||||
|
||||
P1w
|
||||
P1W
|
||||
P1p
|
||||
P1c
|
||||
P1c4
|
||||
P1c6
|
||||
P1L
|
||||
P1_prefix
|
||||
P1_suffix
|
||||
P1_shape
|
||||
|
||||
P2w
|
||||
P2W
|
||||
P2p
|
||||
P2c
|
||||
P2c4
|
||||
P2c6
|
||||
P2L
|
||||
P2_prefix
|
||||
P2_suffix
|
||||
P2_shape
|
||||
|
||||
# Misc features at the end
|
||||
dist
|
||||
N0lv
|
||||
|
|
|
@ -12,6 +12,7 @@ from itertools import combinations
|
|||
from ..tokens cimport TokenC
|
||||
from ._state cimport State
|
||||
from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
|
||||
from ._state cimport get_p2, get_p1
|
||||
from ._state cimport has_head, get_left, get_right
|
||||
from ._state cimport count_left_kids, count_right_kids
|
||||
|
||||
|
@ -45,6 +46,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
|||
context[4] = token.lex.cluster & 63
|
||||
context[5] = token.lex.cluster & 15
|
||||
context[6] = token.dep if has_head(token) else 0
|
||||
context[7] = token.lex.prefix
|
||||
context[8] = token.lex.suffix
|
||||
context[9] = token.lex.shape
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, State* state) except -1:
|
||||
|
@ -62,7 +66,8 @@ cdef int fill_context(atom_t* context, State* state) except -1:
|
|||
fill_token(&context[N0l2w], get_left(state, get_n0(state), 2))
|
||||
fill_token(&context[N0w], get_n0(state))
|
||||
fill_token(&context[N1w], get_n1(state))
|
||||
fill_token(&context[N2w], get_n2(state))
|
||||
fill_token(&context[P1w], get_p1(state))
|
||||
fill_token(&context[P2w], get_p2(state))
|
||||
|
||||
if state.stack_len >= 1:
|
||||
context[dist] = state.stack[0] - state.i
|
||||
|
@ -84,6 +89,54 @@ cdef int fill_context(atom_t* context, State* state) except -1:
|
|||
if state.stack_len >= 3:
|
||||
context[S2_has_head] = has_head(get_s2(state))
|
||||
|
||||
ner = (
|
||||
(N0w,),
|
||||
(P1w,),
|
||||
(N1w,),
|
||||
(P2w,),
|
||||
(N2w,),
|
||||
|
||||
(P1w, N0w,),
|
||||
(N0w, N1w),
|
||||
|
||||
(N0_prefix,),
|
||||
(N0_suffix,),
|
||||
|
||||
(P1_shape,),
|
||||
(N0_shape,),
|
||||
(N1_shape,),
|
||||
(P1_shape, N0_shape,),
|
||||
(N0_shape, P1_shape,),
|
||||
(P1_shape, N0_shape, N1_shape),
|
||||
(N2_shape,),
|
||||
(P2_shape,),
|
||||
|
||||
#(P2_norm, P1_norm, W_norm),
|
||||
#(P1_norm, W_norm, N1_norm),
|
||||
#(W_norm, N1_norm, N2_norm)
|
||||
|
||||
(P2p,),
|
||||
(P1p,),
|
||||
(N0p,),
|
||||
(N1p,),
|
||||
(N2p,),
|
||||
|
||||
(P1p, N0p),
|
||||
(N0p, N1p),
|
||||
(P2p, P1p, N0p),
|
||||
(P1p, N0p, N1p),
|
||||
(N0p, N1p, N2p),
|
||||
|
||||
(P2c,),
|
||||
(P1c,),
|
||||
(N0c,),
|
||||
(N1c,),
|
||||
(N2c,),
|
||||
|
||||
(P1c, N0c),
|
||||
(N0c, N1c),
|
||||
)
|
||||
|
||||
|
||||
unigrams = (
|
||||
(S2W, S2p),
|
||||
|
|
|
@ -40,6 +40,21 @@ cdef inline TokenC* get_n1(const State* s) nogil:
|
|||
return &s.sent[s.i+1]
|
||||
|
||||
|
||||
cdef inline TokenC* get_p1(const State* s) nogil:
    # Pointer to the token one position before index s.i in s.sent,
    # or NULL when s.i < 1 (there is no such token).
    if s.i >= 1:
        return &s.sent[s.i-1]
    return NULL
|
||||
|
||||
|
||||
cdef inline TokenC* get_p2(const State* s) nogil:
    # Pointer to the token two positions before index s.i in s.sent,
    # or NULL when s.i < 2 (there is no such token).
    if s.i >= 2:
        return &s.sent[s.i-2]
    return NULL
|
||||
|
||||
|
||||
|
||||
cdef inline TokenC* get_n2(const State* s) nogil:
|
||||
if (s.i + 2) >= s.sent_len:
|
||||
return NULL
|
||||
|
@ -77,7 +92,7 @@ cdef int head_in_buffer(const State *s, const int child, const int* gold) except
|
|||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
|
||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
|
||||
|
||||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
||||
cdef State* new_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
||||
|
||||
|
||||
cdef int count_left_kids(const TokenC* head) nogil
|
||||
|
|
|
@ -2,7 +2,7 @@ from libc.string cimport memmove, memcpy
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from ..lexeme cimport EMPTY_LEXEME
|
||||
from ..structs cimport TokenC
|
||||
from ..structs cimport TokenC, Entity
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
@ -112,13 +112,15 @@ cdef int count_right_kids(const TokenC* head) nogil:
|
|||
return _popcount(head.r_kids)
|
||||
|
||||
|
||||
cdef State* init_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
||||
cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
||||
cdef int padded_len = sent_len + PADDING + PADDING
|
||||
cdef State* s = <State*>mem.alloc(1, sizeof(State))
|
||||
s.ent = <Entity*>mem.alloc(padded_len, sizeof(Entity))
|
||||
s.stack = <int*>mem.alloc(padded_len, sizeof(int))
|
||||
for i in range(PADDING):
|
||||
s.stack[i] = -1
|
||||
s.stack += (PADDING - 1)
|
||||
s.ent += (PADDING - 1)
|
||||
assert s.stack[0] == -1
|
||||
state_sent = <TokenC*>mem.alloc(padded_len, sizeof(TokenC))
|
||||
memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC))
|
||||
|
@ -126,5 +128,4 @@ cdef State* init_state(Pool mem, const TokenC* sent, const int sent_len) except
|
|||
s.stack_len = 0
|
||||
s.i = 0
|
||||
s.sent_len = sent_len
|
||||
push_stack(s)
|
||||
return s
|
||||
|
|
|
@ -58,7 +58,6 @@ cdef class ArcEager(TransitionSystem):
|
|||
gold.c_heads[i] = gold.heads[i]
|
||||
gold.c_labels[i] = self.label_ids[gold.labels[i]]
|
||||
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
if '-' in name:
|
||||
move_str, label_str = name.split('-', 1)
|
||||
|
@ -82,6 +81,9 @@ cdef class ArcEager(TransitionSystem):
|
|||
t.get_cost = get_cost_funcs[move]
|
||||
return t
|
||||
|
||||
cdef int first_state(self, State* state) except -1:
|
||||
push_stack(state)
|
||||
|
||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
||||
cdef bint[N_MOVES] is_valid
|
||||
is_valid[SHIFT] = _can_shift(s)
|
||||
|
|
|
@ -14,6 +14,7 @@ cdef class GoldParse:
|
|||
cdef readonly list heads
|
||||
cdef readonly list labels
|
||||
cdef readonly list ner
|
||||
cdef readonly list ents
|
||||
|
||||
cdef int* c_tags
|
||||
cdef int* c_heads
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import numpy
|
||||
import codecs
|
||||
from .ner_util import iob_to_biluo
|
||||
|
||||
from libc.string cimport memset
|
||||
|
||||
|
@ -47,6 +46,7 @@ def _parse_line(line):
|
|||
label = pieces[7]
|
||||
return id_, word, pos, head_idx, label, iob_ent
|
||||
|
||||
|
||||
cdef class GoldParse:
|
||||
def __init__(self, tokens, annot_tuples):
|
||||
self.mem = Pool()
|
||||
|
@ -62,9 +62,12 @@ cdef class GoldParse:
|
|||
self.tags = [None] * len(tokens)
|
||||
self.heads = [-1] * len(tokens)
|
||||
self.labels = ['MISSING'] * len(tokens)
|
||||
self.ner = [None] * len(tokens)
|
||||
self.ner = ['O'] * len(tokens)
|
||||
|
||||
idx_map = {token.idx: token.i for token in tokens}
|
||||
self.ents = []
|
||||
ent_start = None
|
||||
ent_label = None
|
||||
for idx, tag, head, label, ner in zip(*annot_tuples):
|
||||
if idx < tokens[0].idx:
|
||||
pass
|
||||
|
@ -76,8 +79,29 @@ cdef class GoldParse:
|
|||
self.heads[i] = idx_map.get(head, -1)
|
||||
self.labels[i] = label
|
||||
self.tags[i] = tag
|
||||
self.labels[i] = label
|
||||
self.ner[i] = ner
|
||||
if ner == '-':
|
||||
self.ner[i] = '-'
|
||||
# Deal with inconsistencies in BILUO arising from tokenization
|
||||
if ner[0] in ('B', 'U', 'O') and ent_start is not None:
|
||||
self.ents.append((ent_start, i, ent_label))
|
||||
ent_start = None
|
||||
ent_label = None
|
||||
if ner[0] in ('B', 'U'):
|
||||
ent_start = i
|
||||
ent_label = ner[2:]
|
||||
if ent_start is not None:
|
||||
self.ents.append((ent_start, self.length, ent_label))
|
||||
for start, end, label in self.ents:
|
||||
if start == (end - 1):
|
||||
self.ner[start] = 'U-%s' % label
|
||||
else:
|
||||
self.ner[start] = 'B-%s' % label
|
||||
for i in range(start+1, end-1):
|
||||
self.ner[i] = 'I-%s' % label
|
||||
self.ner[end-1] = 'L-%s' % label
|
||||
|
||||
def __len__(self):
|
||||
return self.length
|
||||
|
||||
@property
|
||||
def n_non_punct(self):
|
||||
|
|
|
@ -34,15 +34,14 @@ cdef do_func_t[N_MOVES] do_funcs
|
|||
|
||||
|
||||
cdef bint entity_is_open(const State *s) except -1:
|
||||
return s.sent[s.i - 1].ent.tag >= 1
|
||||
return s.ents_len >= 1 and s.ent.end == 0
|
||||
|
||||
|
||||
cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
|
||||
if not entity_is_open(s):
|
||||
return False
|
||||
|
||||
cdef const Entity* curr = &s.sent[s.i - 1].ent
|
||||
cdef const Transition* gold = &golds[(s.i - 1) + curr.start]
|
||||
cdef const Transition* gold = &golds[(s.i - 1) + s.ent.start]
|
||||
if gold.move != BEGIN and gold.move != UNIT:
|
||||
return True
|
||||
elif gold.label != s.ent.label:
|
||||
|
@ -52,14 +51,16 @@ cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
|
|||
|
||||
|
||||
cdef int _is_valid(int act, int label, const State* s) except -1:
|
||||
if act == BEGIN:
|
||||
return not entity_is_open(s)
|
||||
if act == MISSING:
|
||||
return False
|
||||
elif act == BEGIN:
|
||||
return label != 0 and not entity_is_open(s)
|
||||
elif act == IN:
|
||||
return entity_is_open(s) and s.ent.label == label
|
||||
return entity_is_open(s) and label != 0 and s.ent.label == label
|
||||
elif act == LAST:
|
||||
return entity_is_open(s) and s.ent.label == label
|
||||
return entity_is_open(s) and label != 0 and s.ent.label == label
|
||||
elif act == UNIT:
|
||||
return not entity_is_open(s)
|
||||
return label != 0 and not entity_is_open(s)
|
||||
elif act == OUT:
|
||||
return not entity_is_open(s)
|
||||
else:
|
||||
|
@ -69,22 +70,34 @@ cdef int _is_valid(int act, int label, const State* s) except -1:
|
|||
cdef class BiluoPushDown(TransitionSystem):
|
||||
@classmethod
|
||||
def get_labels(cls, gold_tuples):
|
||||
move_labels = {BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, OUT: {'ROOT': True}}
|
||||
moves = ('-', 'B', 'I', 'L', 'U')
|
||||
for (raw_text, toks, (ids, tags, heads, labels, iob)) in gold_tuples:
|
||||
for i, ner_tag in enumerate(iob_to_biluo(iob)):
|
||||
move_labels = {MISSING: {'ROOT': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
|
||||
OUT: {'ROOT': True}}
|
||||
moves = ('M', 'B', 'I', 'L', 'U')
|
||||
for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples:
|
||||
for i, ner_tag in enumerate(biluo):
|
||||
if ner_tag != 'O' and ner_tag != '-':
|
||||
move_str, label = ner_tag.split('-')
|
||||
move_labels[moves.index(move_str)][label] = True
|
||||
return move_labels
|
||||
|
||||
def move_name(self, int move, int label):
    """Return the string name of a transition.

    move: integer move id (compared against the OUT and MISSING constants,
        otherwise used to index MOVE_NAMES).
    label: integer label id; mapped back to its string via self.label_ids
        for moves other than OUT and MISSING.

    Returns 'O' for OUT, 'M' for MISSING, and '<move name>-<label name>'
    for every other move.
    """
    if move == OUT:
        return 'O'
    elif move == MISSING:
        # Compare against the MISSING move constant. The earlier comparison
        # with the string 'MISSING' could never be true for an int move, so
        # this branch was unreachable and MISSING fell through to the
        # labelled-move lookup below.
        return 'M'
    else:
        # Invert the name -> id mapping to recover the label string.
        labels = {id_: name for name, id_ in self.label_ids.items()}
        return MOVE_NAMES[move] + '-' + labels[label]
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
biluo_strings = iob_to_biluo(gold.ner)
|
||||
for i in range(gold.length):
|
||||
gold.c_ner[i] = self.lookup_transition(biluo_strings[i])
|
||||
gold.c_ner[i] = self.lookup_transition(gold.ner[i])
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
if '-' in name:
|
||||
if name == '-':
|
||||
move_str = 'M'
|
||||
label = 0
|
||||
elif '-' in name:
|
||||
move_str, label_str = name.split('-', 1)
|
||||
label = self.label_ids[label_str]
|
||||
else:
|
||||
|
@ -107,6 +120,9 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
t.get_cost = _get_cost
|
||||
return t
|
||||
|
||||
cdef int first_state(self, State* state) except -1:
|
||||
pass
|
||||
|
||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
||||
cdef int best = -1
|
||||
cdef weight_t score = -90000
|
||||
|
@ -128,8 +144,9 @@ cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) excep
|
|||
return 9000
|
||||
cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
|
||||
cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT
|
||||
return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label,
|
||||
next_act, is_sunk)
|
||||
cdef bint is_gold = _is_gold(self.move, self.label, gold.c_ner[s.i].move,
|
||||
gold.c_ner[s.i].label, next_act, is_sunk)
|
||||
return not is_gold
|
||||
|
||||
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
|
||||
int next_act, bint is_sunk):
|
||||
|
@ -210,18 +227,21 @@ cdef int _do_begin(const Transition* self, State* s) except -1:
|
|||
s.ents_len += 1
|
||||
s.ent.start = s.i
|
||||
s.ent.label = self.label
|
||||
s.sent[s.i].ent.tag = self.clas
|
||||
s.sent[s.i].ent_iob = 3
|
||||
s.sent[s.i].ent_type = self.label
|
||||
s.i += 1
|
||||
|
||||
|
||||
cdef int _do_in(const Transition* self, State* s) except -1:
|
||||
s.sent[s.i].ent.tag = self.clas
|
||||
s.sent[s.i].ent_iob = 1
|
||||
s.sent[s.i].ent_type = self.label
|
||||
s.i += 1
|
||||
|
||||
|
||||
cdef int _do_last(const Transition* self, State* s) except -1:
|
||||
s.ent.end = s.i+1
|
||||
s.sent[s.i].ent.tag = self.clas
|
||||
s.sent[s.i].ent_iob = 1
|
||||
s.sent[s.i].ent_type = self.label
|
||||
s.i += 1
|
||||
|
||||
|
||||
|
@ -231,12 +251,13 @@ cdef int _do_unit(const Transition* self, State* s) except -1:
|
|||
s.ent.start = s.i
|
||||
s.ent.label = self.label
|
||||
s.ent.end = s.i+1
|
||||
s.sent[s.i].ent.tag = self.clas
|
||||
s.sent[s.i].ent_iob = 3
|
||||
s.sent[s.i].ent_type = self.label
|
||||
s.i += 1
|
||||
|
||||
|
||||
cdef int _do_out(const Transition* self, State* s) except -1:
|
||||
s.sent[s.i].ent.tag = self.clas
|
||||
s.sent[s.i].ent_iob = 2
|
||||
s.i += 1
|
||||
|
||||
|
||||
|
|
|
@ -28,14 +28,12 @@ from ..tokens cimport Tokens, TokenC
|
|||
from .arc_eager cimport TransitionSystem, Transition
|
||||
from .transition_system import OracleError
|
||||
|
||||
from ._state cimport init_state, State, is_final, get_idx, get_s0, get_s1, get_n0, get_n1
|
||||
from ._state cimport new_state, State, is_final, get_idx, get_s0, get_s1, get_n0, get_n1
|
||||
from .conll cimport GoldParse
|
||||
|
||||
from . import _parse_features
|
||||
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
||||
|
||||
from ._ner_features cimport _ner_features
|
||||
|
||||
|
||||
DEBUG = False
|
||||
def set_debug(val):
|
||||
|
@ -50,7 +48,11 @@ cdef unicode print_state(State* s, list words):
|
|||
third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head
|
||||
n0 = words[s.i]
|
||||
n1 = words[s.i + 1]
|
||||
return ' '.join((str(s.stack_len), third, second, top, '|', n0, n1))
|
||||
if s.ents_len:
|
||||
ent = '%s %d-%d' % (s.ent.label, s.ent.start, s.ent.end)
|
||||
else:
|
||||
ent = '-'
|
||||
return ' '.join((ent, str(s.stack_len), third, second, top, '|', n0, n1))
|
||||
|
||||
|
||||
def get_templates(name):
|
||||
|
@ -58,7 +60,7 @@ def get_templates(name):
|
|||
if name == 'zhang':
|
||||
return pf.arc_eager
|
||||
elif name == 'ner':
|
||||
return _ner_features.basic
|
||||
return pf.ner
|
||||
else:
|
||||
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
||||
pf.tree_shape + pf.trigrams)
|
||||
|
@ -79,7 +81,8 @@ cdef class GreedyParser:
|
|||
cdef atom_t[CONTEXT_SIZE] context
|
||||
cdef int n_feats
|
||||
cdef Pool mem = Pool()
|
||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
||||
cdef State* state = new_state(mem, tokens.data, tokens.length)
|
||||
self.moves.first_state(state)
|
||||
cdef Transition guess
|
||||
while not is_final(state):
|
||||
fill_context(context, state)
|
||||
|
@ -99,10 +102,12 @@ cdef class GreedyParser:
|
|||
Transition best
|
||||
|
||||
atom_t[CONTEXT_SIZE] context
|
||||
|
||||
|
||||
self.moves.preprocess_gold(gold)
|
||||
cdef Pool mem = Pool()
|
||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
||||
cdef State* state = new_state(mem, tokens.data, tokens.length)
|
||||
self.moves.first_state(state)
|
||||
py_words = [t.orth_ for t in tokens]
|
||||
while not is_final(state):
|
||||
fill_context(context, state)
|
||||
scores = self.model.score(context)
|
||||
|
@ -114,7 +119,3 @@ cdef class GreedyParser:
|
|||
best.do(&best, state)
|
||||
else:
|
||||
guess.do(&guess, state)
|
||||
n_corr = gold.heads_correct(state.sent, score_punct=True)
|
||||
if force_gold and n_corr != tokens.length:
|
||||
raise OracleError
|
||||
return n_corr
|
||||
|
|
|
@ -29,6 +29,8 @@ cdef class TransitionSystem:
|
|||
cdef const Transition* c
|
||||
cdef readonly int n_moves
|
||||
|
||||
cdef int first_state(self, State* state) except -1
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *
|
||||
|
|
|
@ -28,6 +28,9 @@ cdef class TransitionSystem:
|
|||
self.label_ids['MISSING'] = -1
|
||||
self.c = moves
|
||||
|
||||
cdef int first_state(self, State* state) except -1:
|
||||
raise NotImplementedError
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
raise NotImplementedError
|
||||
|
||||
|
|
|
@ -39,6 +39,7 @@ cdef class Tokens:
|
|||
cdef unicode _string
|
||||
cdef tuple _tag_strings
|
||||
cdef tuple _dep_strings
|
||||
cdef public tuple _ent_strings
|
||||
|
||||
cdef public bint is_tagged
|
||||
cdef public bint is_parsed
|
||||
|
|
|
@ -94,6 +94,7 @@ cdef class Tokens:
|
|||
self._py_tokens = []
|
||||
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
||||
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
||||
self._ent_strings = tuple() # TODO: Clean this up
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Retrieve a token.
|
||||
|
@ -129,6 +130,28 @@ cdef class Tokens:
|
|||
cdef const TokenC* last = &self.data[self.length - 1]
|
||||
return self._string[:last.idx + last.lex.length]
|
||||
|
||||
property ents:
    def __get__(self):
        """Yield a (start, end, label) tuple for each entity span.

        start is the index of the first token of the entity, end is one past
        its last token, and label is looked up in self._ent_strings using the
        ent_type of the token that opened the entity.

        The per-token ent_iob codes read here are: 3 opens an entity,
        1 continues the open entity, 2 is outside any entity. Other values
        are ignored.
        """
        cdef int i
        cdef const TokenC* token
        cdef int start = -1
        cdef object label = None
        for i in range(self.length):
            token = &self.data[i]
            if token.ent_iob == 1:
                # A continuation token is only meaningful inside an open span.
                assert start != -1
            elif token.ent_iob == 2:
                if start != -1:
                    yield (start, i, label)
                start = -1
                label = None
            elif token.ent_iob == 3:
                # A new entity may begin directly after another one, with no
                # outside token in between. Emit the open span first so it is
                # not silently overwritten.
                if start != -1:
                    yield (start, i, label)
                start = i
                label = self._ent_strings[token.ent_type]
        # Flush an entity that runs to the end of the tokens.
        if start != -1:
            yield (start, self.length, label)
|
||||
|
||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||
if self.length == self.max_length:
|
||||
self._realloc(self.length * 2)
|
||||
|
|
Loading…
Reference in New Issue
Block a user