* Refactoring now works for the parser, but the features for NER still need to be rigged up, and then the oracle etc. needs debugging.

This commit is contained in:
Matthew Honnibal 2015-03-09 07:06:01 -04:00
parent 4539c70542
commit ae235e07b9
8 changed files with 105 additions and 126 deletions

View File

@ -206,7 +206,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
Config.write(ner_model_dir, 'config', features=feat_set, seed=seed, Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=Language.EntityTransitionSystem.get_labels(gold_tuples)) labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
nlp = Language() nlp = Language()
@ -214,6 +214,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
for itn in range(n_iter): for itn in range(n_iter):
dep_corr = 0 dep_corr = 0
pos_corr = 0 pos_corr = 0
ent_corr = 0
n_tokens = 0 n_tokens = 0
for raw_text, segmented_text, annot_tuples in gold_tuples: for raw_text, segmented_text, annot_tuples in gold_tuples:
if gold_preproc: if gold_preproc:
@ -221,14 +222,11 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
else: else:
sents = [nlp.tokenizer(raw_text)] sents = [nlp.tokenizer(raw_text)]
for tokens in sents: for tokens in sents:
gold = GoldParse(tokens, annot_tuples)
gold = GoldParse(tokens, annot_tuples, nlp.tags,
nlp.parser.moves.label_ids,
nlp.entity.moves.label_ids)
nlp.tagger(tokens) nlp.tagger(tokens)
#ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold)
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold) dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
pos_corr += nlp.tagger.train(tokens, gold.tags_) pos_corr += nlp.tagger.train(tokens, gold.tags)
n_tokens += len(tokens) n_tokens += len(tokens)
acc = float(dep_corr) / n_tokens acc = float(dep_corr) / n_tokens
pos_acc = float(pos_corr) / n_tokens pos_acc = float(pos_corr) / n_tokens

View File

@ -27,6 +27,13 @@ cdef enum:
BREAK BREAK
N_MOVES N_MOVES
# Short string name for each move constant, indexed by the move's integer
# value. lookup_transition() maps a name back to its move with
# MOVE_NAMES.index().
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[SHIFT] = 'S'
MOVE_NAMES[REDUCE] = 'D'
MOVE_NAMES[LEFT] = 'L'
MOVE_NAMES[RIGHT] = 'R'
MOVE_NAMES[BREAK] = 'B'
cdef do_func_t[N_MOVES] do_funcs cdef do_func_t[N_MOVES] do_funcs
cdef get_cost_func_t[N_MOVES] get_cost_funcs cdef get_cost_func_t[N_MOVES] get_cost_funcs
@ -46,6 +53,23 @@ cdef class ArcEager(TransitionSystem):
move_labels[LEFT][label] = True move_labels[LEFT][label] = True
return move_labels return move_labels
# Fill the gold parse's C arrays from its Python-level annotations: each head
# is copied unchanged into gold.c_heads, and each label string is translated
# to its integer ID through self.label_ids and stored in gold.c_labels.
cdef int preprocess_gold(self, GoldParse gold) except -1:
for i in range(gold.length):
gold.c_heads[i] = gold.heads[i]
gold.c_labels[i] = self.label_ids[gold.labels[i]]
cdef Transition lookup_transition(self, object name) except *:
    """Return the transition whose move and label match `name`.

    `name` is either a bare move name from MOVE_NAMES (e.g. 'S') or a
    '<move>-<label>' string. A bare move name is looked up with label 0;
    otherwise the label string is translated through self.label_ids.

    Raises ValueError (from list.index) if the move name is not in
    MOVE_NAMES, and KeyError if no transition has that move and label.
    """
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        # The unlabelled path must bind move_str too; it was previously left
        # unassigned, so every bare move name failed at the index() call.
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Fail loudly rather than fall off the end and hand back an unset struct.
    raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, int label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition() # TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers # constructor with the function pointers

View File

@ -3,31 +3,21 @@ from cymem.cymem cimport Pool
from ..structs cimport TokenC from ..structs cimport TokenC
from .transition_system cimport Transition from .transition_system cimport Transition
cimport numpy
cdef class GoldParse: cdef class GoldParse:
cdef Pool mem cdef Pool mem
cdef int length cdef int length
cdef readonly int loss cdef readonly int loss
cdef readonly object ids cdef readonly list tags
cdef readonly object tags cdef readonly list heads
cdef readonly object heads cdef readonly list labels
cdef readonly object labels cdef readonly list ner
cdef readonly object tags_ cdef int* c_tags
cdef readonly object labels_
cdef readonly object ner_
cdef Transition* ner
cdef int* c_heads cdef int* c_heads
cdef int* c_labels cdef int* c_labels
cdef Transition* c_ner
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1 cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
# C-level attribute declarations for NERAnnotation: a memory pool, three int
# arrays (starts, ends, labels) and a read-only Python list of entities.
cdef class NERAnnotation:
cdef Pool mem
cdef int* starts
cdef int* ends
cdef int* labels
cdef readonly list entities

View File

@ -34,38 +34,37 @@ def read_docparse_file(loc):
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
return sents return sents
# Split one whitespace-delimited annotation line into its fields.
#
# A line with exactly 4 fields is returned as
# (0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]); presumably these
# line up with (id, word, pos, head index, label) as in the other branch.
# Any other line is read by fixed column position: 0=id, 1=word, 3=pos,
# 5=IOB entity tag, 6=head index, 7=label. Here the head index is not
# decremented.
#
# NOTE(review): the 4-field branch returns 5 values and the other branch
# returns 6 -- confirm callers cope with both shapes.
def _parse_line(line):
pieces = line.split()
if len(pieces) == 4:
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
else:
id_ = int(pieces[0])
word = pieces[1]
pos = pieces[3]
iob_ent = pieces[5]
head_idx = int(pieces[6])
label = pieces[7]
return id_, word, pos, head_idx, label, iob_ent
cdef class GoldParse: cdef class GoldParse:
def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types): def __init__(self, tokens, annot_tuples):
self.mem = Pool() self.mem = Pool()
self.loss = 0 self.loss = 0
self.length = len(tokens) self.length = len(tokens)
self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.ids[:] = -1 # These are filled by the tagger/parser/entity recogniser
self.tags[:] = -1 self.c_tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.heads[:] = -1
self.labels[:] = -1
self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int)) self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int)) self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c_ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
for i in range(len(tokens)): self.tags = [None] * len(tokens)
self.c_heads[i] = -1 self.heads = [-1] * len(tokens)
self.c_labels[i] = -1 self.labels = ['MISSING'] * len(tokens)
self.ner = [None] * len(tokens)
self.tags_ = [None] * len(tokens)
self.labels_ = [None] * len(tokens)
self.ner_ = [None] * len(tokens)
idx_map = {token.idx: token.i for token in tokens} idx_map = {token.idx: token.i for token in tokens}
print idx_map
# TODO: Fill NER moves
print raw_text
for idx, tag, head, label, ner in zip(*annot_tuples): for idx, tag, head, label, ner in zip(*annot_tuples):
if idx < tokens[0].idx: if idx < tokens[0].idx:
pass pass
@ -73,16 +72,12 @@ cdef class GoldParse:
break break
elif idx in idx_map: elif idx in idx_map:
i = idx_map[idx] i = idx_map[idx]
print i, idx, head, idx_map.get(head, -1) self.tags[i] = tag
self.ids[i] = idx
self.tags[i] = pos_tags.index(tag)
self.heads[i] = idx_map.get(head, -1) self.heads[i] = idx_map.get(head, -1)
self.labels[i] = dep_labels[label] self.labels[i] = label
self.c_heads[i] = -1 self.tags[i] = tag
self.c_labels[i] = -1 self.labels[i] = label
self.tags_[i] = tag self.ner[i] = ner
self.labels_[i] = label
self.ner_[i] = ner
@property @property
def n_non_punct(self): def n_non_punct(self):
@ -116,71 +111,3 @@ def _map_indices_to_tokens(ids, heads):
return mapped return mapped
# Split one whitespace-delimited annotation line into its fields.
#
# A line with exactly 4 fields is returned as
# (0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]); presumably these
# line up with (id, word, pos, head index, label) as in the other branch.
# Any other line is read by fixed column position: 0=id, 1=word, 3=pos,
# 5=IOB entity tag, 6=head index, 7=label. Here the head index is not
# decremented.
#
# NOTE(review): the 4-field branch returns 5 values and the other branch
# returns 6 -- confirm callers cope with both shapes.
def _parse_line(line):
pieces = line.split()
if len(pieces) == 4:
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
else:
id_ = int(pieces[0])
word = pieces[1]
pos = pieces[3]
iob_ent = pieces[5]
head_idx = int(pieces[6])
label = pieces[7]
return id_, word, pos, head_idx, label, iob_ent
cdef class NERAnnotation:
    """Span-based entity annotation over a sequence of `length` tokens.

    For every token covered by an entity, `starts`, `ends` and `labels` hold
    that entity's start index, exclusive end index and integer label. Tokens
    not covered by any entity hold -1 in all three arrays.
    """
    def __init__(self, entities, length, entity_types):
        """Build the per-token arrays from (start, end, label) triples.

        `entities` is an iterable of (start, end, label) int triples with an
        exclusive `end`. `entity_types` is accepted but not used here.
        """
        cdef int start, end, label
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # Setting every byte to 0xFF makes every int -1, i.e. "no entity".
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)
        for start, end, label in entities:
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @property
    def biluo_tags(self):
        # Not implemented yet; evaluates to None.
        pass

    @property
    def iob_tags(self):
        # Not implemented yet; evaluates to None.
        pass

    @classmethod
    def from_iobs(cls, iob_strs, entity_types):
        """Build an annotation from IOB tags by converting them to BILUO."""
        return cls.from_biluos(iob_to_biluo(iob_strs), entity_types)

    @classmethod
    def from_biluos(cls, tag_strs, entity_types):
        """Build an annotation from '<move>-<label>' BILUO tag strings.

        'O' and '-' tags are skipped. A 'U' tag yields a one-token entity,
        and a 'B' ... 'L' pair yields an entity from the 'B' token through
        the 'L' token. Labels not yet in `entity_types` are appended to it,
        so the list is extended in place; an entity's integer label is its
        label string's index in that list.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str in ('O', '-'):
                continue
            # Split only on the first '-' so labels may contain hyphens.
            move, label_str = tag_str.split('-', 1)
            # list.index raises ValueError for a missing item rather than
            # returning -1, so register unseen labels before looking them up,
            # and store the label string, not its index.
            if label_str not in entity_types:
                entity_types.append(label_str)
            label = entity_types.index(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i + 1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i + 1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)

View File

@ -21,6 +21,14 @@ cdef enum:
OUT OUT
N_MOVES N_MOVES
# Short string name for each move constant, indexed by the move's integer
# value. lookup_transition() maps a name back to its move with
# MOVE_NAMES.index().
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[MISSING] = 'M'
MOVE_NAMES[BEGIN] = 'B'
MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
cdef do_func_t[N_MOVES] do_funcs cdef do_func_t[N_MOVES] do_funcs
@ -70,6 +78,23 @@ cdef class BiluoPushDown(TransitionSystem):
move_labels[moves.index(move_str)][label] = True move_labels[moves.index(move_str)][label] = True
return move_labels return move_labels
# Fill gold.c_ner from the gold parse's Python-level entity tags: the tags in
# gold.ner are passed through iob_to_biluo(), and each resulting string is
# resolved to a Transition with lookup_transition().
cdef int preprocess_gold(self, GoldParse gold) except -1:
biluo_strings = iob_to_biluo(gold.ner)
for i in range(gold.length):
gold.c_ner[i] = self.lookup_transition(biluo_strings[i])
cdef Transition lookup_transition(self, object name) except *:
    """Return the transition whose move and label match `name`.

    `name` is either a bare move name from MOVE_NAMES (e.g. 'O') or a
    '<move>-<label>' string. A bare move name is looked up with label 0;
    otherwise the label string is translated through self.label_ids.

    Raises ValueError (from list.index) if the move name is not in
    MOVE_NAMES, and KeyError if no transition has that move and label.
    """
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Fail loudly rather than fall off the end and hand back an unset struct.
    raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, int label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition() # TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers # constructor with the function pointers
@ -101,9 +126,9 @@ cdef class BiluoPushDown(TransitionSystem):
cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1: cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1:
if not _is_valid(self.move, self.label, s): if not _is_valid(self.move, self.label, s):
return 9000 return 9000
cdef bint is_sunk = _entity_is_sunk(s, gold.ner) cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT
return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label, return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label,
next_act, is_sunk) next_act, is_sunk)
cdef bint _is_gold(int act, int tag, int g_act, int g_tag, cdef bint _is_gold(int act, int tag, int g_act, int g_tag,

View File

@ -34,6 +34,8 @@ from .conll cimport GoldParse
from . import _parse_features from . import _parse_features
from ._parse_features cimport fill_context, CONTEXT_SIZE from ._parse_features cimport fill_context, CONTEXT_SIZE
from ._ner_features cimport _ner_features
DEBUG = False DEBUG = False
def set_debug(val): def set_debug(val):
@ -55,6 +57,8 @@ def get_templates(name):
pf = _parse_features pf = _parse_features
if name == 'zhang': if name == 'zhang':
return pf.arc_eager return pf.arc_eager
elif name == 'ner':
return _ner_features.basic
else: else:
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams) pf.tree_shape + pf.trigrams)
@ -95,7 +99,8 @@ cdef class GreedyParser:
Transition best Transition best
atom_t[CONTEXT_SIZE] context atom_t[CONTEXT_SIZE] context
self.moves.preprocess_gold(gold)
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef State* state = init_state(mem, tokens.data, tokens.length) cdef State* state = init_state(mem, tokens.data, tokens.length)
while not is_final(state): while not is_final(state):

View File

@ -29,6 +29,10 @@ cdef class TransitionSystem:
cdef const Transition* c cdef const Transition* c
cdef readonly int n_moves cdef readonly int n_moves
cdef int preprocess_gold(self, GoldParse gold) except -1
cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except * cdef Transition init_transition(self, int clas, int move, int label) except *
cdef Transition best_valid(self, const weight_t* scores, const State* state) except * cdef Transition best_valid(self, const weight_t* scores, const State* state) except *

View File

@ -28,6 +28,12 @@ cdef class TransitionSystem:
self.label_ids['MISSING'] = -1 self.label_ids['MISSING'] = -1
self.c = moves self.c = moves
# Base-class placeholder: always raises NotImplementedError. ArcEager and
# BiluoPushDown each define their own preprocess_gold.
cdef int preprocess_gold(self, GoldParse gold) except -1:
raise NotImplementedError
# Base-class placeholder: always raises NotImplementedError. ArcEager and
# BiluoPushDown each define their own lookup_transition.
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, int label) except *:
raise NotImplementedError raise NotImplementedError