* Refactoring is working for the parser, but we still need to rig up features for NER, and then debug the oracle, etc.

This commit is contained in:
Matthew Honnibal 2015-03-09 07:06:01 -04:00
parent 4539c70542
commit ae235e07b9
8 changed files with 105 additions and 126 deletions

View File

@ -206,7 +206,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
Config.write(ner_model_dir, 'config', features=feat_set, seed=seed,
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
nlp = Language()
@ -214,6 +214,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
for itn in range(n_iter):
dep_corr = 0
pos_corr = 0
ent_corr = 0
n_tokens = 0
for raw_text, segmented_text, annot_tuples in gold_tuples:
if gold_preproc:
@ -221,14 +222,11 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
else:
sents = [nlp.tokenizer(raw_text)]
for tokens in sents:
gold = GoldParse(tokens, annot_tuples, nlp.tags,
nlp.parser.moves.label_ids,
nlp.entity.moves.label_ids)
gold = GoldParse(tokens, annot_tuples)
nlp.tagger(tokens)
#ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold)
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
pos_corr += nlp.tagger.train(tokens, gold.tags_)
pos_corr += nlp.tagger.train(tokens, gold.tags)
n_tokens += len(tokens)
acc = float(dep_corr) / n_tokens
pos_acc = float(pos_corr) / n_tokens

View File

@ -27,6 +27,13 @@ cdef enum:
BREAK
N_MOVES
# String name of each transition move, indexed by the move's enum value.
# lookup_transition() maps a name back to its move with MOVE_NAMES.index().
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[SHIFT] = 'S'
MOVE_NAMES[REDUCE] = 'D'
MOVE_NAMES[LEFT] = 'L'
MOVE_NAMES[RIGHT] = 'R'
MOVE_NAMES[BREAK] = 'B'
cdef do_func_t[N_MOVES] do_funcs
cdef get_cost_func_t[N_MOVES] get_cost_funcs
@ -46,6 +53,23 @@ cdef class ArcEager(TransitionSystem):
move_labels[LEFT][label] = True
return move_labels
cdef int preprocess_gold(self, GoldParse gold) except -1:
    """Copy the gold heads and labels into the C arrays on `gold`.

    For every position below gold.length, gold.heads[i] is copied to
    gold.c_heads[i], and the string in gold.labels[i] is converted to its
    integer id through self.label_ids and stored in gold.c_labels[i].
    """
    for token_i in range(gold.length):
        gold.c_heads[token_i] = gold.heads[token_i]
        label_name = gold.labels[token_i]
        gold.c_labels[token_i] = self.label_ids[label_name]
cdef Transition lookup_transition(self, object name) except *:
    """Return the transition whose string name is `name`.

    `name` is either a bare move name from MOVE_NAMES (e.g. 'S') or
    '<move>-<label>'; only the first '-' separates the move from the label.
    A bare move name is looked up with label 0.

    Raises ValueError if the move name is not in MOVE_NAMES, and KeyError if
    the label is not in self.label_ids or no transition in self.c matches.
    """
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        # Unlabelled move: the whole name is the move string.
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Falling off the end would hand back an uninitialised Transition.
    raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers

View File

@ -3,31 +3,21 @@ from cymem.cymem cimport Pool
from ..structs cimport TokenC
from .transition_system cimport Transition
cimport numpy
# Declaration of GoldParse's attributes and C-level method.
# NOTE(review): several attributes below are declared more than once with
# different types (`tags`, `heads` and `labels` as both `object` and `list`;
# `ner` as both `list` and `Transition*`). This looks like the pre- and
# post-change declarations shown together -- confirm which set is current.
cdef class GoldParse:
cdef Pool mem
cdef int length
cdef readonly int loss
cdef readonly object ids
cdef readonly object tags
cdef readonly object heads
cdef readonly object labels
cdef readonly list tags
cdef readonly list heads
cdef readonly list labels
cdef readonly list ner
cdef readonly object tags_
cdef readonly object labels_
cdef readonly object ner_
cdef Transition* ner
# C arrays with the c_ prefix are allocated in __init__ with one slot per token.
cdef int* c_tags
cdef int* c_heads
cdef int* c_labels
cdef Transition* c_ner
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
# Declaration of NERAnnotation's attributes.
cdef class NERAnnotation:
# Pool used to allocate the three int arrays below.
cdef Pool mem
# Per-token arrays: the constructor initialises them to -1 and, for each
# (start, end, label) entity, writes that entity's values at every token
# index in range(start, end).
cdef int* starts
cdef int* ends
cdef int* labels
# The entity list passed to the constructor, readable from Python.
cdef readonly list entities

View File

@ -34,38 +34,37 @@ def read_docparse_file(loc):
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
return sents
def _parse_line(line):
pieces = line.split()
if len(pieces) == 4:
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
else:
id_ = int(pieces[0])
word = pieces[1]
pos = pieces[3]
iob_ent = pieces[5]
head_idx = int(pieces[6])
label = pieces[7]
return id_, word, pos, head_idx, label, iob_ent
cdef class GoldParse:
def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types):
def __init__(self, tokens, annot_tuples):
self.mem = Pool()
self.loss = 0
self.length = len(tokens)
self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
self.ids[:] = -1
self.tags[:] = -1
self.heads[:] = -1
self.labels[:] = -1
self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
# These are filled by the tagger/parser/entity recogniser
self.c_tags = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
self.c_ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
for i in range(len(tokens)):
self.c_heads[i] = -1
self.c_labels[i] = -1
self.tags_ = [None] * len(tokens)
self.labels_ = [None] * len(tokens)
self.ner_ = [None] * len(tokens)
self.tags = [None] * len(tokens)
self.heads = [-1] * len(tokens)
self.labels = ['MISSING'] * len(tokens)
self.ner = [None] * len(tokens)
idx_map = {token.idx: token.i for token in tokens}
print idx_map
# TODO: Fill NER moves
print raw_text
for idx, tag, head, label, ner in zip(*annot_tuples):
if idx < tokens[0].idx:
pass
@ -73,16 +72,12 @@ cdef class GoldParse:
break
elif idx in idx_map:
i = idx_map[idx]
print i, idx, head, idx_map.get(head, -1)
self.ids[i] = idx
self.tags[i] = pos_tags.index(tag)
self.tags[i] = tag
self.heads[i] = idx_map.get(head, -1)
self.labels[i] = dep_labels[label]
self.c_heads[i] = -1
self.c_labels[i] = -1
self.tags_[i] = tag
self.labels_[i] = label
self.ner_[i] = ner
self.labels[i] = label
self.tags[i] = tag
self.labels[i] = label
self.ner[i] = ner
@property
def n_non_punct(self):
@ -116,71 +111,3 @@ def _map_indices_to_tokens(ids, heads):
return mapped
def _parse_line(line):
pieces = line.split()
if len(pieces) == 4:
return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3]
else:
id_ = int(pieces[0])
word = pieces[1]
pos = pieces[3]
iob_ent = pieces[5]
head_idx = int(pieces[6])
label = pieces[7]
return id_, word, pos, head_idx, label, iob_ent
cdef class NERAnnotation:
    """Token-aligned view of a list of labelled entity spans.

    For every token index inside an entity (start, end, label), `starts`,
    `ends` and `labels` hold that entity's values; every other slot is -1.
    """
    def __init__(self, entities, length, entity_types):
        """Build the per-token arrays.

        entities: list of (start, end, label) int triples; tokens in
            range(start, end) are marked as belonging to the entity.
        length: number of slots to allocate in each array.
        entity_types: accepted for interface compatibility; not used here.

        Raises IndexError if a span does not lie within [0, length].
        """
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # memset with -1 sets every byte to 0xFF, which makes each int -1.
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)
        cdef int start, end, label
        for start, end, label in entities:
            # The arrays are raw C buffers: refuse spans that would write
            # outside them instead of corrupting memory.
            if start < 0 or end > length:
                raise IndexError(
                    "entity span (%d, %d) outside [0, %d]" % (start, end, length))
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @property
    def biluo_tags(self):
        # Not implemented yet: evaluates to None.
        pass

    @property
    def iob_tags(self):
        # Not implemented yet: evaluates to None.
        pass

    @classmethod
    def from_iobs(cls, iob_strs, entity_types):
        """Build an annotation from IOB tag strings via iob_to_biluo()."""
        return cls.from_biluos(iob_to_biluo(iob_strs), entity_types)

    @classmethod
    def from_biluos(cls, tag_strs, entity_types):
        """Build an annotation from one '<move>-<label>' tag string per token.

        'O' and '-' tags are skipped.  'U' emits a one-token entity, 'B'
        opens an entity and 'L' closes it; other moves leave the open entity
        untouched.  A label's integer id is its position in `entity_types`;
        labels not yet in the list are appended to it, so `entity_types` may
        be mutated.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # Split on the first '-' only so labels may contain hyphens.
            move, label_str = tag_str.split('-', 1)
            # list.index() raises for unseen items rather than returning -1,
            # so register new types explicitly, storing the type string.
            if label_str not in entity_types:
                entity_types.append(label_str)
            label = entity_types.index(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)

View File

@ -21,6 +21,14 @@ cdef enum:
OUT
N_MOVES
# String name of each entity-recognition move, indexed by the move's enum
# value. lookup_transition() maps a name back to its move with
# MOVE_NAMES.index().
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[MISSING] = 'M'
MOVE_NAMES[BEGIN] = 'B'
MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
cdef do_func_t[N_MOVES] do_funcs
@ -70,6 +78,23 @@ cdef class BiluoPushDown(TransitionSystem):
move_labels[moves.index(move_str)][label] = True
return move_labels
cdef int preprocess_gold(self, GoldParse gold) except -1:
    """Fill gold.c_ner with one transition per token.

    gold.ner is converted with iob_to_biluo(), and each resulting tag string
    is resolved to a Transition through lookup_transition().
    """
    tag_names = iob_to_biluo(gold.ner)
    for token_i in range(gold.length):
        tag_name = tag_names[token_i]
        gold.c_ner[token_i] = self.lookup_transition(tag_name)
cdef Transition lookup_transition(self, object name) except *:
    """Return the transition whose string name is `name`.

    `name` is either a bare move name from MOVE_NAMES (e.g. 'O') or
    '<move>-<label>'; only the first '-' separates the move from the label.
    A bare move name is looked up with label 0.

    Raises ValueError if the move name is not in MOVE_NAMES, and KeyError if
    the label is not in self.label_ids or no transition in self.c matches.
    """
    if '-' in name:
        move_str, label_str = name.split('-', 1)
        label = self.label_ids[label_str]
    else:
        move_str = name
        label = 0
    move = MOVE_NAMES.index(move_str)
    for i in range(self.n_moves):
        if self.c[i].move == move and self.c[i].label == label:
            return self.c[i]
    # Falling off the end would hand back an uninitialised Transition.
    raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
@ -101,9 +126,9 @@ cdef class BiluoPushDown(TransitionSystem):
# Cost of applying this transition in state `s` given the gold annotation:
# 9000 when the move is not valid in `s`; otherwise 0 if _is_gold() accepts
# the move and 1 if it does not.
cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1:
if not _is_valid(self.move, self.label, s):
return 9000
# NOTE(review): the next two groups of three lines are identical except that
# one reads gold.ner and the other gold.c_ner, and is_sunk/next_act are
# declared twice. They look like the before/after versions of the same
# statements -- confirm which group should remain.
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT
return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label,
cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
# NOTE(review): the guard tests s.i but the read is at s.i+1. If sent_len is
# the number of tokens, this reads one slot past the end on the last token --
# confirm whether the guard should be `s.i + 1 < s.sent_len`.
cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT
return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label,
next_act, is_sunk)
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,

View File

@ -34,6 +34,8 @@ from .conll cimport GoldParse
from . import _parse_features
from ._parse_features cimport fill_context, CONTEXT_SIZE
from ._ner_features cimport _ner_features
DEBUG = False
def set_debug(val):
@ -55,6 +57,8 @@ def get_templates(name):
pf = _parse_features
if name == 'zhang':
return pf.arc_eager
elif name == 'ner':
return _ner_features.basic
else:
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams)
@ -95,7 +99,8 @@ cdef class GreedyParser:
Transition best
atom_t[CONTEXT_SIZE] context
self.moves.preprocess_gold(gold)
cdef Pool mem = Pool()
cdef State* state = init_state(mem, tokens.data, tokens.length)
while not is_final(state):

View File

@ -29,6 +29,10 @@ cdef class TransitionSystem:
cdef const Transition* c
cdef readonly int n_moves
cdef int preprocess_gold(self, GoldParse gold) except -1
cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except *
cdef Transition best_valid(self, const weight_t* scores, const State* state) except *

View File

@ -28,6 +28,12 @@ cdef class TransitionSystem:
self.label_ids['MISSING'] = -1
self.c = moves
# Abstract hook: subclasses override this to process `gold`; the base
# implementation always raises NotImplementedError.
cdef int preprocess_gold(self, GoldParse gold) except -1:
raise NotImplementedError
# Abstract hook: subclasses override this to map a transition name to a
# Transition; the base implementation always raises NotImplementedError.
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *:
raise NotImplementedError