* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only the current word and 2 words of context are used. Needs refactoring.

This commit is contained in:
Matthew Honnibal 2015-03-10 13:00:23 -04:00
parent e99f19dd6c
commit 8057a95f20
16 changed files with 298 additions and 91 deletions

View File

@ -24,6 +24,8 @@ from spacy.syntax.util import Config
from spacy.syntax.conll import read_docparse_file from spacy.syntax.conll import read_docparse_file
from spacy.syntax.conll import GoldParse from spacy.syntax.conll import GoldParse
from spacy.scorer import Scorer
def is_punct_label(label): def is_punct_label(label):
return label == 'P' or label.lower() == 'punct' return label == 'P' or label.lower() == 'punct'
@ -186,7 +188,6 @@ def get_labels(sents):
def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
gold_preproc=False, force_gold=False, n_sents=0): gold_preproc=False, force_gold=False, n_sents=0):
print "Setup model dir"
dep_model_dir = path.join(model_dir, 'deps') dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos') pos_model_dir = path.join(model_dir, 'pos')
ner_model_dir = path.join(model_dir, 'ner') ner_model_dir = path.join(model_dir, 'ner')
@ -209,13 +210,16 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
Config.write(ner_model_dir, 'config', features='ner', seed=seed, Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=Language.EntityTransitionSystem.get_labels(gold_tuples)) labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
nlp = Language() nlp = Language()
ent_strings = [None] * (max(nlp.entity.moves.label_ids.values()) + 1)
for label, i in nlp.entity.moves.label_ids.items():
ent_strings[i] = label
print "Itn.\tUAS\tNER F.\tTag %"
for itn in range(n_iter): for itn in range(n_iter):
dep_corr = 0 scorer = Scorer()
pos_corr = 0
ent_corr = 0
n_tokens = 0
for raw_text, segmented_text, annot_tuples in gold_tuples: for raw_text, segmented_text, annot_tuples in gold_tuples:
if gold_preproc: if gold_preproc:
sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text] sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
@ -224,51 +228,32 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
for tokens in sents: for tokens in sents:
gold = GoldParse(tokens, annot_tuples) gold = GoldParse(tokens, annot_tuples)
nlp.tagger(tokens) nlp.tagger(tokens)
#ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold) nlp.entity.train(tokens, gold, force_gold=force_gold)
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold) #nlp.parser.train(tokens, gold, force_gold=force_gold)
pos_corr += nlp.tagger.train(tokens, gold.tags) nlp.tagger.train(tokens, gold.tags)
n_tokens += len(tokens)
acc = float(dep_corr) / n_tokens nlp.entity(tokens)
pos_acc = float(pos_corr) / n_tokens tokens._ent_strings = tuple(ent_strings)
print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc nlp.parser(tokens)
scorer.score(tokens, gold, verbose=False)
print '%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)
random.shuffle(gold_tuples) random.shuffle(gold_tuples)
nlp.parser.model.end_training() nlp.parser.model.end_training()
nlp.entity.model.end_training()
nlp.tagger.model.end_training() nlp.tagger.model.end_training()
return acc
def evaluate(Language, dev_loc, model_dir, gold_preproc=False): def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
global loss global loss
assert not gold_preproc
nlp = Language() nlp = Language()
uas_corr = 0 gold_tuples = read_docparse_file(dev_loc)
las_corr = 0 scorer = Scorer()
pos_corr = 0
n_tokens = 0
total = 0
skipped = 0
loss = 0
gold_tuples = read_docparse_file(train_loc)
for raw_text, segmented_text, annot_tuples in gold_tuples: for raw_text, segmented_text, annot_tuples in gold_tuples:
if gold_preproc: tokens = nlp(raw_text)
tokens = nlp.tokenizer.tokens_from_list(gold_sent.words) gold = GoldParse(tokens, annot_tuples)
nlp.tagger(tokens) scorer.score(tokens, gold, verbose=False)
nlp.parser(tokens) return scorer
gold_sent.map_heads(nlp.parser.moves.label_ids)
else:
tokens = nlp(gold_sent.raw_text)
loss += gold_sent.align_to_tokens(tokens, nlp.parser.moves.label_ids)
for i, token in enumerate(tokens):
pos_corr += token.tag_ == gold_sent.tags[i]
n_tokens += 1
if gold_sent.heads[i] is None:
skipped += 1
continue
if gold_sent.labels[i] != 'P':
n_corr += gold_sent.is_correct(i, token.head.i)
total += 1
print loss, skipped, (loss+skipped + total)
print pos_corr / n_tokens
return float(n_corr) / (total + loss)
@ -281,7 +266,14 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
def main(train_loc, dev_loc, model_dir, n_sents=0): def main(train_loc, dev_loc, model_dir, n_sents=0):
train(English, train_loc, model_dir, train(English, train_loc, model_dir,
gold_preproc=False, force_gold=False, n_sents=n_sents) gold_preproc=False, force_gold=False, n_sents=n_sents)
print evaluate(English, dev_loc, model_dir, gold_preproc=False) scorer = evaluate(English, dev_loc, model_dir, gold_preproc=False)
print 'POS', scorer.tags_acc
print 'UAS', scorer.uas
print 'LAS', scorer.las
print 'NER P', scorer.ents_p
print 'NER R', scorer.ents_r
print 'NER F', scorer.ents_f
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -180,7 +180,12 @@ class English(object):
if parse and self.has_parser_model: if parse and self.has_parser_model:
self.parser(tokens) self.parser(tokens)
if entity and self.has_entity_model: if entity and self.has_entity_model:
# TODO: Clean this up
self.entity(tokens) self.entity(tokens)
ent_strings = [None] * (max(self.entity.moves.label_ids.values()) + 1)
for label, i in self.entity.moves.label_ids.items():
ent_strings[i] = label
tokens._ent_strings = tuple(ent_strings)
return tokens return tokens
@property @property

View File

@ -45,14 +45,12 @@ cdef struct PosTag:
cdef struct Entity: cdef struct Entity:
int start int start
int end int end
int tag
int label int label
cdef struct TokenC: cdef struct TokenC:
const LexemeC* lex const LexemeC* lex
Morphology morph Morphology morph
Entity ent
univ_pos_t pos univ_pos_t pos
int tag int tag
int idx int idx
@ -64,6 +62,9 @@ cdef struct TokenC:
uint32_t l_kids uint32_t l_kids
uint32_t r_kids uint32_t r_kids
int ent_iob
int ent_type
cdef struct Utf8Str: cdef struct Utf8Str:
id_t i id_t i

View File

@ -16,6 +16,7 @@ cdef int fill_context(atom_t* context, State* state) except -1
# S0w, # S0w,
# S0r0w, S0r2w, S0rw, # S0r0w, S0r2w, S0rw,
# N0l0w, N0l2w, N0lw, # N0l0w, N0l2w, N0lw,
# P2w, P1w,
# N0w, N1w, N2w, N3w, 0 # N0w, N1w, N2w, N3w, 0
#] #]
@ -28,6 +29,9 @@ cpdef enum:
S2c4 S2c4
S2c6 S2c6
S2L S2L
S2_prefix
S2_suffix
S2_shape
S1w S1w
S1W S1W
@ -36,6 +40,9 @@ cpdef enum:
S1c4 S1c4
S1c6 S1c6
S1L S1L
S1_prefix
S1_suffix
S1_shape
S1rw S1rw
S1rW S1rW
@ -44,6 +51,9 @@ cpdef enum:
S1rc4 S1rc4
S1rc6 S1rc6
S1rL S1rL
S1r_prefix
S1r_suffix
S1r_shape
S0lw S0lw
S0lW S0lW
@ -52,6 +62,9 @@ cpdef enum:
S0lc4 S0lc4
S0lc6 S0lc6
S0lL S0lL
S0l_prefix
S0l_suffix
S0l_shape
S0l2w S0l2w
S0l2W S0l2W
@ -60,6 +73,9 @@ cpdef enum:
S0l2c4 S0l2c4
S0l2c6 S0l2c6
S0l2L S0l2L
S0l2_prefix
S0l2_suffix
S0l2_shape
S0w S0w
S0W S0W
@ -68,6 +84,9 @@ cpdef enum:
S0c4 S0c4
S0c6 S0c6
S0L S0L
S0_prefix
S0_suffix
S0_shape
S0r2w S0r2w
S0r2W S0r2W
@ -76,6 +95,9 @@ cpdef enum:
S0r2c4 S0r2c4
S0r2c6 S0r2c6
S0r2L S0r2L
S0r2_prefix
S0r2_suffix
S0r2_shape
S0rw S0rw
S0rW S0rW
@ -84,6 +106,9 @@ cpdef enum:
S0rc4 S0rc4
S0rc6 S0rc6
S0rL S0rL
S0r_prefix
S0r_suffix
S0r_shape
N0l2w N0l2w
N0l2W N0l2W
@ -92,6 +117,9 @@ cpdef enum:
N0l2c4 N0l2c4
N0l2c6 N0l2c6
N0l2L N0l2L
N0l2_prefix
N0l2_suffix
N0l2_shape
N0lw N0lw
N0lW N0lW
@ -100,6 +128,9 @@ cpdef enum:
N0lc4 N0lc4
N0lc6 N0lc6
N0lL N0lL
N0l_prefix
N0l_suffix
N0l_shape
N0w N0w
N0W N0W
@ -108,6 +139,9 @@ cpdef enum:
N0c4 N0c4
N0c6 N0c6
N0L N0L
N0_prefix
N0_suffix
N0_shape
N1w N1w
N1W N1W
@ -116,6 +150,9 @@ cpdef enum:
N1c4 N1c4
N1c6 N1c6
N1L N1L
N1_prefix
N1_suffix
N1_shape
N2w N2w
N2W N2W
@ -124,6 +161,31 @@ cpdef enum:
N2c4 N2c4
N2c6 N2c6
N2L N2L
N2_prefix
N2_suffix
N2_shape
P1w
P1W
P1p
P1c
P1c4
P1c6
P1L
P1_prefix
P1_suffix
P1_shape
P2w
P2W
P2p
P2c
P2c4
P2c6
P2L
P2_prefix
P2_suffix
P2_shape
# Misc features at the end # Misc features at the end
dist dist

View File

@ -12,6 +12,7 @@ from itertools import combinations
from ..tokens cimport TokenC from ..tokens cimport TokenC
from ._state cimport State from ._state cimport State
from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2 from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
from ._state cimport get_p2, get_p1
from ._state cimport has_head, get_left, get_right from ._state cimport has_head, get_left, get_right
from ._state cimport count_left_kids, count_right_kids from ._state cimport count_left_kids, count_right_kids
@ -45,6 +46,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
context[4] = token.lex.cluster & 63 context[4] = token.lex.cluster & 63
context[5] = token.lex.cluster & 15 context[5] = token.lex.cluster & 15
context[6] = token.dep if has_head(token) else 0 context[6] = token.dep if has_head(token) else 0
context[7] = token.lex.prefix
context[8] = token.lex.suffix
context[9] = token.lex.shape
cdef int fill_context(atom_t* context, State* state) except -1: cdef int fill_context(atom_t* context, State* state) except -1:
@ -62,7 +66,8 @@ cdef int fill_context(atom_t* context, State* state) except -1:
fill_token(&context[N0l2w], get_left(state, get_n0(state), 2)) fill_token(&context[N0l2w], get_left(state, get_n0(state), 2))
fill_token(&context[N0w], get_n0(state)) fill_token(&context[N0w], get_n0(state))
fill_token(&context[N1w], get_n1(state)) fill_token(&context[N1w], get_n1(state))
fill_token(&context[N2w], get_n2(state)) fill_token(&context[P1w], get_p1(state))
fill_token(&context[P2w], get_p2(state))
if state.stack_len >= 1: if state.stack_len >= 1:
context[dist] = state.stack[0] - state.i context[dist] = state.stack[0] - state.i
@ -84,6 +89,54 @@ cdef int fill_context(atom_t* context, State* state) except -1:
if state.stack_len >= 3: if state.stack_len >= 3:
context[S2_has_head] = has_head(get_s2(state)) context[S2_has_head] = has_head(get_s2(state))
# Feature templates for the entity recogniser. Each inner tuple names one or
# more slots of the context array; a tuple with several slots is presumably
# used as a single combined feature (confirm against the model's extractor).
#
# Slot prefixes follow fill_context: P1/P2 are the tokens at s.i-1 / s.i-2
# (get_p1 / get_p2), N0 is the current buffer token, N1/N2 the tokens after it.
# The _prefix/_suffix/_shape slots are filled from token.lex.prefix/suffix/shape.
# NOTE(review): the visible fill_context hunk replaces the N2w fill call with
# the P1w/P2w calls, yet N2 slots are still referenced below -- confirm that
# the N2 slots are actually populated.
ner = (
# Word identity, unigrams and adjacent bigrams.
(N0w,),
(P1w,),
(N1w,),
(P2w,),
(N2w,),
(P1w, N0w,),
(N0w, N1w),
# Prefix / suffix of the current token.
(N0_prefix,),
(N0_suffix,),
# Word-shape features over the window.
(P1_shape,),
(N0_shape,),
(N1_shape,),
(P1_shape, N0_shape,),
# NOTE(review): same two slots as the previous template, in the other order.
(N0_shape, P1_shape,),
(P1_shape, N0_shape, N1_shape),
(N2_shape,),
(P2_shape,),
# Disabled templates; the *_norm / W_* slots they name are not declared in
# the enum hunks visible here.
#(P2_norm, P1_norm, W_norm),
#(P1_norm, W_norm, N1_norm),
#(W_norm, N1_norm, N2_norm)
# "p" slots (presumably the POS tag -- TODO confirm): unigrams to trigrams.
(P2p,),
(P1p,),
(N0p,),
(N1p,),
(N2p,),
(P1p, N0p),
(N0p, N1p),
(P2p, P1p, N0p),
(P1p, N0p, N1p),
(N0p, N1p, N2p),
# "c" slots (presumably the word cluster -- TODO confirm): unigrams, bigrams.
(P2c,),
(P1c,),
(N0c,),
(N1c,),
(N2c,),
(P1c, N0c),
(N0c, N1c),
)
unigrams = ( unigrams = (
(S2W, S2p), (S2W, S2p),

View File

@ -40,6 +40,21 @@ cdef inline TokenC* get_n1(const State* s) nogil:
return &s.sent[s.i+1] return &s.sent[s.i+1]
cdef inline TokenC* get_p1(const State* s) nogil:
    # Token one position before s.i, or NULL when s.i < 1 and no such
    # token exists.
    if s.i >= 1:
        return &s.sent[s.i - 1]
    return NULL
cdef inline TokenC* get_p2(const State* s) nogil:
    # Token two positions before s.i, or NULL when s.i < 2 and no such
    # token exists.
    if s.i >= 2:
        return &s.sent[s.i - 2]
    return NULL
cdef inline TokenC* get_n2(const State* s) nogil: cdef inline TokenC* get_n2(const State* s) nogil:
if (s.i + 2) >= s.sent_len: if (s.i + 2) >= s.sent_len:
return NULL return NULL
@ -77,7 +92,7 @@ cdef int head_in_buffer(const State *s, const int child, const int* gold) except
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1 cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1 cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL cdef State* new_state(Pool mem, TokenC* sent, const int sent_length) except NULL
cdef int count_left_kids(const TokenC* head) nogil cdef int count_left_kids(const TokenC* head) nogil

View File

@ -2,7 +2,7 @@ from libc.string cimport memmove, memcpy
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from ..lexeme cimport EMPTY_LEXEME from ..lexeme cimport EMPTY_LEXEME
from ..structs cimport TokenC from ..structs cimport TokenC, Entity
DEF PADDING = 5 DEF PADDING = 5
@ -112,13 +112,15 @@ cdef int count_right_kids(const TokenC* head) nogil:
return _popcount(head.r_kids) return _popcount(head.r_kids)
cdef State* init_state(Pool mem, const TokenC* sent, const int sent_len) except NULL: cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
cdef int padded_len = sent_len + PADDING + PADDING cdef int padded_len = sent_len + PADDING + PADDING
cdef State* s = <State*>mem.alloc(1, sizeof(State)) cdef State* s = <State*>mem.alloc(1, sizeof(State))
s.ent = <Entity*>mem.alloc(padded_len, sizeof(Entity))
s.stack = <int*>mem.alloc(padded_len, sizeof(int)) s.stack = <int*>mem.alloc(padded_len, sizeof(int))
for i in range(PADDING): for i in range(PADDING):
s.stack[i] = -1 s.stack[i] = -1
s.stack += (PADDING - 1) s.stack += (PADDING - 1)
s.ent += (PADDING - 1)
assert s.stack[0] == -1 assert s.stack[0] == -1
state_sent = <TokenC*>mem.alloc(padded_len, sizeof(TokenC)) state_sent = <TokenC*>mem.alloc(padded_len, sizeof(TokenC))
memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC)) memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC))
@ -126,5 +128,4 @@ cdef State* init_state(Pool mem, const TokenC* sent, const int sent_len) except
s.stack_len = 0 s.stack_len = 0
s.i = 0 s.i = 0
s.sent_len = sent_len s.sent_len = sent_len
push_stack(s)
return s return s

View File

@ -58,7 +58,6 @@ cdef class ArcEager(TransitionSystem):
gold.c_heads[i] = gold.heads[i] gold.c_heads[i] = gold.heads[i]
gold.c_labels[i] = self.label_ids[gold.labels[i]] gold.c_labels[i] = self.label_ids[gold.labels[i]]
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
if '-' in name: if '-' in name:
move_str, label_str = name.split('-', 1) move_str, label_str = name.split('-', 1)
@ -82,6 +81,9 @@ cdef class ArcEager(TransitionSystem):
t.get_cost = get_cost_funcs[move] t.get_cost = get_cost_funcs[move]
return t return t
cdef int first_state(self, State* state) except -1:
push_stack(state)
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
cdef bint[N_MOVES] is_valid cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = _can_shift(s) is_valid[SHIFT] = _can_shift(s)

View File

@ -14,6 +14,7 @@ cdef class GoldParse:
cdef readonly list heads cdef readonly list heads
cdef readonly list labels cdef readonly list labels
cdef readonly list ner cdef readonly list ner
cdef readonly list ents
cdef int* c_tags cdef int* c_tags
cdef int* c_heads cdef int* c_heads

View File

@ -1,6 +1,5 @@
import numpy import numpy
import codecs import codecs
from .ner_util import iob_to_biluo
from libc.string cimport memset from libc.string cimport memset
@ -47,6 +46,7 @@ def _parse_line(line):
label = pieces[7] label = pieces[7]
return id_, word, pos, head_idx, label, iob_ent return id_, word, pos, head_idx, label, iob_ent
cdef class GoldParse: cdef class GoldParse:
def __init__(self, tokens, annot_tuples): def __init__(self, tokens, annot_tuples):
self.mem = Pool() self.mem = Pool()
@ -62,9 +62,12 @@ cdef class GoldParse:
self.tags = [None] * len(tokens) self.tags = [None] * len(tokens)
self.heads = [-1] * len(tokens) self.heads = [-1] * len(tokens)
self.labels = ['MISSING'] * len(tokens) self.labels = ['MISSING'] * len(tokens)
self.ner = [None] * len(tokens) self.ner = ['O'] * len(tokens)
idx_map = {token.idx: token.i for token in tokens} idx_map = {token.idx: token.i for token in tokens}
self.ents = []
ent_start = None
ent_label = None
for idx, tag, head, label, ner in zip(*annot_tuples): for idx, tag, head, label, ner in zip(*annot_tuples):
if idx < tokens[0].idx: if idx < tokens[0].idx:
pass pass
@ -76,8 +79,29 @@ cdef class GoldParse:
self.heads[i] = idx_map.get(head, -1) self.heads[i] = idx_map.get(head, -1)
self.labels[i] = label self.labels[i] = label
self.tags[i] = tag self.tags[i] = tag
self.labels[i] = label if ner == '-':
self.ner[i] = ner self.ner[i] = '-'
# Deal with inconsistencies in BILUO arising from tokenization
if ner[0] in ('B', 'U', 'O') and ent_start is not None:
self.ents.append((ent_start, i, ent_label))
ent_start = None
ent_label = None
if ner[0] in ('B', 'U'):
ent_start = i
ent_label = ner[2:]
if ent_start is not None:
self.ents.append((ent_start, self.length, ent_label))
for start, end, label in self.ents:
if start == (end - 1):
self.ner[start] = 'U-%s' % label
else:
self.ner[start] = 'B-%s' % label
for i in range(start+1, end-1):
self.ner[i] = 'I-%s' % label
self.ner[end-1] = 'L-%s' % label
def __len__(self):
return self.length
@property @property
def n_non_punct(self): def n_non_punct(self):

View File

@ -34,15 +34,14 @@ cdef do_func_t[N_MOVES] do_funcs
cdef bint entity_is_open(const State *s) except -1: cdef bint entity_is_open(const State *s) except -1:
return s.sent[s.i - 1].ent.tag >= 1 return s.ents_len >= 1 and s.ent.end == 0
cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1: cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
if not entity_is_open(s): if not entity_is_open(s):
return False return False
cdef const Entity* curr = &s.sent[s.i - 1].ent cdef const Transition* gold = &golds[(s.i - 1) + s.ent.start]
cdef const Transition* gold = &golds[(s.i - 1) + curr.start]
if gold.move != BEGIN and gold.move != UNIT: if gold.move != BEGIN and gold.move != UNIT:
return True return True
elif gold.label != s.ent.label: elif gold.label != s.ent.label:
@ -52,14 +51,16 @@ cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
cdef int _is_valid(int act, int label, const State* s) except -1: cdef int _is_valid(int act, int label, const State* s) except -1:
if act == BEGIN: if act == MISSING:
return not entity_is_open(s) return False
elif act == BEGIN:
return label != 0 and not entity_is_open(s)
elif act == IN: elif act == IN:
return entity_is_open(s) and s.ent.label == label return entity_is_open(s) and label != 0 and s.ent.label == label
elif act == LAST: elif act == LAST:
return entity_is_open(s) and s.ent.label == label return entity_is_open(s) and label != 0 and s.ent.label == label
elif act == UNIT: elif act == UNIT:
return not entity_is_open(s) return label != 0 and not entity_is_open(s)
elif act == OUT: elif act == OUT:
return not entity_is_open(s) return not entity_is_open(s)
else: else:
@ -69,22 +70,34 @@ cdef int _is_valid(int act, int label, const State* s) except -1:
cdef class BiluoPushDown(TransitionSystem): cdef class BiluoPushDown(TransitionSystem):
@classmethod @classmethod
def get_labels(cls, gold_tuples): def get_labels(cls, gold_tuples):
move_labels = {BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, OUT: {'ROOT': True}} move_labels = {MISSING: {'ROOT': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
moves = ('-', 'B', 'I', 'L', 'U') OUT: {'ROOT': True}}
for (raw_text, toks, (ids, tags, heads, labels, iob)) in gold_tuples: moves = ('M', 'B', 'I', 'L', 'U')
for i, ner_tag in enumerate(iob_to_biluo(iob)): for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples:
for i, ner_tag in enumerate(biluo):
if ner_tag != 'O' and ner_tag != '-': if ner_tag != 'O' and ner_tag != '-':
move_str, label = ner_tag.split('-') move_str, label = ner_tag.split('-')
move_labels[moves.index(move_str)][label] = True move_labels[moves.index(move_str)][label] = True
return move_labels return move_labels
def move_name(self, int move, int label):
    """Return the string name of a transition.

    move: integer move code (one of the module's move constants).
    label: integer label id, looked up in self.label_ids for labelled moves.

    Returns 'O' for the OUT move, 'M' for the MISSING move, and otherwise
    '<move name>-<label name>', e.g. the MOVE_NAMES entry for the move joined
    to the label string with a hyphen.
    """
    if move == OUT:
        return 'O'
    # Compare against the MISSING move constant. The previous comparison
    # with the string 'MISSING' could never be true for an int move code, so
    # missing moves fell through to the labelled branch.
    elif move == MISSING:
        return 'M'
    else:
        # Invert the name -> id mapping to recover the label string.
        labels = {id_: name for name, id_ in self.label_ids.items()}
        return MOVE_NAMES[move] + '-' + labels[label]
cdef int preprocess_gold(self, GoldParse gold) except -1: cdef int preprocess_gold(self, GoldParse gold) except -1:
biluo_strings = iob_to_biluo(gold.ner)
for i in range(gold.length): for i in range(gold.length):
gold.c_ner[i] = self.lookup_transition(biluo_strings[i]) gold.c_ner[i] = self.lookup_transition(gold.ner[i])
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
if '-' in name: if name == '-':
move_str = 'M'
label = 0
elif '-' in name:
move_str, label_str = name.split('-', 1) move_str, label_str = name.split('-', 1)
label = self.label_ids[label_str] label = self.label_ids[label_str]
else: else:
@ -107,6 +120,9 @@ cdef class BiluoPushDown(TransitionSystem):
t.get_cost = _get_cost t.get_cost = _get_cost
return t return t
cdef int first_state(self, State* state) except -1:
pass
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
cdef int best = -1 cdef int best = -1
cdef weight_t score = -90000 cdef weight_t score = -90000
@ -128,8 +144,9 @@ cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) excep
return 9000 return 9000
cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner) cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT
return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label, cdef bint is_gold = _is_gold(self.move, self.label, gold.c_ner[s.i].move,
next_act, is_sunk) gold.c_ner[s.i].label, next_act, is_sunk)
return not is_gold
cdef bint _is_gold(int act, int tag, int g_act, int g_tag, cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
int next_act, bint is_sunk): int next_act, bint is_sunk):
@ -210,18 +227,21 @@ cdef int _do_begin(const Transition* self, State* s) except -1:
s.ents_len += 1 s.ents_len += 1
s.ent.start = s.i s.ent.start = s.i
s.ent.label = self.label s.ent.label = self.label
s.sent[s.i].ent.tag = self.clas s.sent[s.i].ent_iob = 3
s.sent[s.i].ent_type = self.label
s.i += 1 s.i += 1
cdef int _do_in(const Transition* self, State* s) except -1: cdef int _do_in(const Transition* self, State* s) except -1:
s.sent[s.i].ent.tag = self.clas s.sent[s.i].ent_iob = 1
s.sent[s.i].ent_type = self.label
s.i += 1 s.i += 1
cdef int _do_last(const Transition* self, State* s) except -1: cdef int _do_last(const Transition* self, State* s) except -1:
s.ent.end = s.i+1 s.ent.end = s.i+1
s.sent[s.i].ent.tag = self.clas s.sent[s.i].ent_iob = 1
s.sent[s.i].ent_type = self.label
s.i += 1 s.i += 1
@ -231,12 +251,13 @@ cdef int _do_unit(const Transition* self, State* s) except -1:
s.ent.start = s.i s.ent.start = s.i
s.ent.label = self.label s.ent.label = self.label
s.ent.end = s.i+1 s.ent.end = s.i+1
s.sent[s.i].ent.tag = self.clas s.sent[s.i].ent_iob = 3
s.sent[s.i].ent_type = self.label
s.i += 1 s.i += 1
cdef int _do_out(const Transition* self, State* s) except -1: cdef int _do_out(const Transition* self, State* s) except -1:
s.sent[s.i].ent.tag = self.clas s.sent[s.i].ent_iob = 2
s.i += 1 s.i += 1

View File

@ -28,14 +28,12 @@ from ..tokens cimport Tokens, TokenC
from .arc_eager cimport TransitionSystem, Transition from .arc_eager cimport TransitionSystem, Transition
from .transition_system import OracleError from .transition_system import OracleError
from ._state cimport init_state, State, is_final, get_idx, get_s0, get_s1, get_n0, get_n1 from ._state cimport new_state, State, is_final, get_idx, get_s0, get_s1, get_n0, get_n1
from .conll cimport GoldParse from .conll cimport GoldParse
from . import _parse_features from . import _parse_features
from ._parse_features cimport fill_context, CONTEXT_SIZE from ._parse_features cimport fill_context, CONTEXT_SIZE
from ._ner_features cimport _ner_features
DEBUG = False DEBUG = False
def set_debug(val): def set_debug(val):
@ -50,7 +48,11 @@ cdef unicode print_state(State* s, list words):
third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head
n0 = words[s.i] n0 = words[s.i]
n1 = words[s.i + 1] n1 = words[s.i + 1]
return ' '.join((str(s.stack_len), third, second, top, '|', n0, n1)) if s.ents_len:
ent = '%s %d-%d' % (s.ent.label, s.ent.start, s.ent.end)
else:
ent = '-'
return ' '.join((ent, str(s.stack_len), third, second, top, '|', n0, n1))
def get_templates(name): def get_templates(name):
@ -58,7 +60,7 @@ def get_templates(name):
if name == 'zhang': if name == 'zhang':
return pf.arc_eager return pf.arc_eager
elif name == 'ner': elif name == 'ner':
return _ner_features.basic return pf.ner
else: else:
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams) pf.tree_shape + pf.trigrams)
@ -79,7 +81,8 @@ cdef class GreedyParser:
cdef atom_t[CONTEXT_SIZE] context cdef atom_t[CONTEXT_SIZE] context
cdef int n_feats cdef int n_feats
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef State* state = init_state(mem, tokens.data, tokens.length) cdef State* state = new_state(mem, tokens.data, tokens.length)
self.moves.first_state(state)
cdef Transition guess cdef Transition guess
while not is_final(state): while not is_final(state):
fill_context(context, state) fill_context(context, state)
@ -102,7 +105,9 @@ cdef class GreedyParser:
self.moves.preprocess_gold(gold) self.moves.preprocess_gold(gold)
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef State* state = init_state(mem, tokens.data, tokens.length) cdef State* state = new_state(mem, tokens.data, tokens.length)
self.moves.first_state(state)
py_words = [t.orth_ for t in tokens]
while not is_final(state): while not is_final(state):
fill_context(context, state) fill_context(context, state)
scores = self.model.score(context) scores = self.model.score(context)
@ -114,7 +119,3 @@ cdef class GreedyParser:
best.do(&best, state) best.do(&best, state)
else: else:
guess.do(&guess, state) guess.do(&guess, state)
n_corr = gold.heads_correct(state.sent, score_punct=True)
if force_gold and n_corr != tokens.length:
raise OracleError
return n_corr

View File

@ -29,6 +29,8 @@ cdef class TransitionSystem:
cdef const Transition* c cdef const Transition* c
cdef readonly int n_moves cdef readonly int n_moves
cdef int first_state(self, State* state) except -1
cdef int preprocess_gold(self, GoldParse gold) except -1 cdef int preprocess_gold(self, GoldParse gold) except -1
cdef Transition lookup_transition(self, object name) except * cdef Transition lookup_transition(self, object name) except *

View File

@ -28,6 +28,9 @@ cdef class TransitionSystem:
self.label_ids['MISSING'] = -1 self.label_ids['MISSING'] = -1
self.c = moves self.c = moves
cdef int first_state(self, State* state) except -1:
raise NotImplementedError
cdef int preprocess_gold(self, GoldParse gold) except -1: cdef int preprocess_gold(self, GoldParse gold) except -1:
raise NotImplementedError raise NotImplementedError

View File

@ -39,6 +39,7 @@ cdef class Tokens:
cdef unicode _string cdef unicode _string
cdef tuple _tag_strings cdef tuple _tag_strings
cdef tuple _dep_strings cdef tuple _dep_strings
cdef public tuple _ent_strings
cdef public bint is_tagged cdef public bint is_tagged
cdef public bint is_parsed cdef public bint is_parsed

View File

@ -94,6 +94,7 @@ cdef class Tokens:
self._py_tokens = [] self._py_tokens = []
self._tag_strings = tuple() # These will be set by the POS tagger and parser self._tag_strings = tuple() # These will be set by the POS tagger and parser
self._dep_strings = tuple() # The strings are arbitrary and model-specific. self._dep_strings = tuple() # The strings are arbitrary and model-specific.
self._ent_strings = tuple() # TODO: Clean this up
def __getitem__(self, object i): def __getitem__(self, object i):
"""Retrieve a token. """Retrieve a token.
@ -129,6 +130,28 @@ cdef class Tokens:
cdef const TokenC* last = &self.data[self.length - 1] cdef const TokenC* last = &self.data[self.length - 1]
return self._string[:last.idx + last.lex.length] return self._string[:last.idx + last.lex.length]
property ents:
    def __get__(self):
        """Yield (start, end, label) for each entity span in the document.

        start is the index of the span's first token, end is one past its
        last token, and label is the string found in self._ent_strings for
        the first token's ent_type.

        The per-token ent_iob codes are read as follows (matching the values
        the entity transition code assigns): 3 starts an entity, 1 continues
        the open entity, 2 is outside any entity. Any other value leaves the
        current span untouched.
        """
        cdef int i
        cdef const TokenC* token
        cdef int start = -1
        cdef object label = None
        for i in range(self.length):
            token = &self.data[i]
            if token.ent_iob == 1:
                # A continuation token is only meaningful inside an open span.
                assert start != -1
            elif token.ent_iob == 2:
                if start != -1:
                    yield (start, i, label)
                start = -1
                label = None
            elif token.ent_iob == 3:
                # A new entity may begin directly after another one, with no
                # outside token in between. Emit the pending span before
                # overwriting it, otherwise it would be silently dropped.
                if start != -1:
                    yield (start, i, label)
                start = i
                label = self._ent_strings[token.ent_type]
        # Flush an entity that runs to the end of the document.
        if start != -1:
            yield (start, self.length, label)
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length: if self.length == self.max_length:
self._realloc(self.length * 2) self._realloc(self.length * 2)