From ae235e07b9fb0bd53fb2028ed078973d4995e4d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Mar 2015 07:06:01 -0400 Subject: [PATCH] * Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. --- bin/parser/train.py | 12 ++- spacy/syntax/arc_eager.pyx | 24 ++++++ spacy/syntax/conll.pxd | 24 ++---- spacy/syntax/conll.pyx | 123 ++++++----------------------- spacy/syntax/ner.pyx | 31 +++++++- spacy/syntax/parser.pyx | 7 +- spacy/syntax/transition_system.pxd | 4 + spacy/syntax/transition_system.pyx | 6 ++ 8 files changed, 105 insertions(+), 126 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index bb66a2969..e53a4edb4 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -206,7 +206,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) - Config.write(ner_model_dir, 'config', features=feat_set, seed=seed, + Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=Language.EntityTransitionSystem.get_labels(gold_tuples)) nlp = Language() @@ -214,6 +214,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, for itn in range(n_iter): dep_corr = 0 pos_corr = 0 + ent_corr = 0 n_tokens = 0 for raw_text, segmented_text, annot_tuples in gold_tuples: if gold_preproc: @@ -221,14 +222,11 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, else: sents = [nlp.tokenizer(raw_text)] for tokens in sents: - - gold = GoldParse(tokens, annot_tuples, nlp.tags, - nlp.parser.moves.label_ids, - nlp.entity.moves.label_ids) - + gold = GoldParse(tokens, annot_tuples) nlp.tagger(tokens) + #ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold) dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold) - pos_corr += nlp.tagger.train(tokens, gold.tags_) + pos_corr += nlp.tagger.train(tokens, gold.tags) n_tokens += len(tokens) acc = float(dep_corr) / n_tokens pos_acc = float(pos_corr) / n_tokens diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index e8c4b6574..d757d28c1 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -27,6 +27,13 @@ cdef enum: BREAK N_MOVES +MOVE_NAMES = [None] * N_MOVES +MOVE_NAMES[SHIFT] = 'S' +MOVE_NAMES[REDUCE] = 'D' +MOVE_NAMES[LEFT] = 'L' +MOVE_NAMES[RIGHT] = 'R' +MOVE_NAMES[BREAK] = 'B' + cdef do_func_t[N_MOVES] do_funcs cdef get_cost_func_t[N_MOVES] get_cost_funcs @@ -46,6 +53,23 @@ cdef class ArcEager(TransitionSystem): move_labels[LEFT][label] = True return move_labels + cdef int preprocess_gold(self, GoldParse gold) except -1: + for i in range(gold.length): + gold.c_heads[i] = gold.heads[i] + gold.c_labels[i] = self.label_ids[gold.labels[i]] + + + cdef Transition lookup_transition(self, object name) except *: + if '-' in name: + move_str, label_str = name.split('-', 1) + label = self.label_ids[label_str] + else: + label = 0 + move = MOVE_NAMES.index(move_str) + for i in range(self.n_moves): + if self.c[i].move == move and self.c[i].label == label: + return self.c[i] + cdef Transition init_transition(self, int clas, int move, int label) except *: # TODO: Apparent Cython bug here when we try to use the Transition() # constructor with the function pointers diff --git a/spacy/syntax/conll.pxd b/spacy/syntax/conll.pxd index ab05cb666..626cc699d 100644 --- a/spacy/syntax/conll.pxd +++ b/spacy/syntax/conll.pxd @@ -3,31 +3,21 @@ from cymem.cymem cimport Pool from ..structs cimport TokenC from .transition_system cimport Transition +cimport numpy cdef class GoldParse: cdef Pool mem cdef int length cdef readonly int loss - cdef readonly object ids - cdef readonly object tags - cdef readonly object heads - cdef readonly object labels + cdef readonly list tags + cdef readonly list heads + cdef readonly list labels + cdef readonly list ner - cdef readonly object tags_ - cdef readonly object labels_ - cdef readonly object ner_ - - cdef Transition* ner + cdef int* c_tags cdef int* c_heads cdef int* c_labels + cdef Transition* c_ner cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1 - - -cdef class NERAnnotation: - cdef Pool mem - cdef int* starts - cdef int* ends - cdef int* labels - cdef readonly list entities diff --git a/spacy/syntax/conll.pyx b/spacy/syntax/conll.pyx index a1ddf48c0..1f4252138 100644 --- a/spacy/syntax/conll.pyx +++ b/spacy/syntax/conll.pyx @@ -34,38 +34,37 @@ def read_docparse_file(loc): sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) return sents +def _parse_line(line): + pieces = line.split() + if len(pieces) == 4: + return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] + else: + id_ = int(pieces[0]) + word = pieces[1] + pos = pieces[3] + iob_ent = pieces[5] + head_idx = int(pieces[6]) + label = pieces[7] + return id_, word, pos, head_idx, label, iob_ent cdef class GoldParse: - def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types): + def __init__(self, tokens, annot_tuples): self.mem = Pool() self.loss = 0 self.length = len(tokens) - self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) - self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) - self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) - self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) - self.ids[:] = -1 - self.tags[:] = -1 - self.heads[:] = -1 - self.labels[:] = -1 - - self.ner = self.mem.alloc(len(tokens), sizeof(Transition)) + # These are filled by the tagger/parser/entity recogniser + self.c_tags = self.mem.alloc(len(tokens), sizeof(int)) self.c_heads = self.mem.alloc(len(tokens), sizeof(int)) self.c_labels = self.mem.alloc(len(tokens), sizeof(int)) + self.c_ner = self.mem.alloc(len(tokens), sizeof(Transition)) - for i in range(len(tokens)): - self.c_heads[i] = -1 - self.c_labels[i] = -1 - - self.tags_ = [None] * len(tokens) - self.labels_ = [None] * len(tokens) - self.ner_ = [None] * len(tokens) + self.tags = [None] * len(tokens) + self.heads = [-1] * len(tokens) + self.labels = ['MISSING'] * len(tokens) + self.ner = [None] * len(tokens) idx_map = {token.idx: token.i for token in tokens} - print idx_map - # TODO: Fill NER moves - print raw_text for idx, tag, head, label, ner in zip(*annot_tuples): if idx < tokens[0].idx: pass @@ -73,16 +72,12 @@ cdef class GoldParse: break elif idx in idx_map: i = idx_map[idx] - print i, idx, head, idx_map.get(head, -1) - self.ids[i] = idx - self.tags[i] = pos_tags.index(tag) + self.tags[i] = tag self.heads[i] = idx_map.get(head, -1) - self.labels[i] = dep_labels[label] - self.c_heads[i] = -1 - self.c_labels[i] = -1 - self.tags_[i] = tag - self.labels_[i] = label - self.ner_[i] = ner + self.labels[i] = label + self.tags[i] = tag + self.labels[i] = label + self.ner[i] = ner @property def n_non_punct(self): @@ -116,71 +111,3 @@ def _map_indices_to_tokens(ids, heads): return mapped -def _parse_line(line): - pieces = line.split() - if len(pieces) == 4: - return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] - else: - id_ = int(pieces[0]) - word = pieces[1] - pos = pieces[3] - iob_ent = pieces[5] - head_idx = int(pieces[6]) - label = pieces[7] - return id_, word, pos, head_idx, label, iob_ent - - -cdef class NERAnnotation: - def __init__(self, entities, length, entity_types): - self.mem = Pool() - self.starts = self.mem.alloc(length, sizeof(int)) - self.ends = self.mem.alloc(length, sizeof(int)) - self.labels = self.mem.alloc(length, sizeof(int)) - self.entities = entities - memset(self.starts, -1, sizeof(int) * length) - memset(self.ends, -1, sizeof(int) * length) - memset(self.labels, -1, sizeof(int) * length) - - cdef int start, end, label - for start, end, label in entities: - for i in range(start, end): - self.starts[i] = start - self.ends[i] = end - self.labels[i] = label - @property - def biluo_tags(self): - pass - - @property - def iob_tags(self): - pass - - @classmethod - def from_iobs(cls, iob_strs, entity_types): - return cls.from_biluos(iob_to_biluo(iob_strs), entity_types) - - @classmethod - def from_biluos(cls, tag_strs, entity_types): - entities = [] - start = None - for i, tag_str in enumerate(tag_strs): - if tag_str == 'O' or tag_str == '-': - continue - move, label_str = tag_str.split('-') - label = entity_types.index(label_str) - if label == -1: - label = len(entity_types) - entity_types.append(label) - if move == 'U': - assert start is None - entities.append((i, i+1, label)) - elif move == 'B': - assert start is None - start = i - elif move == 'L': - assert start is not None - entities.append((start, i+1, label)) - start = None - return cls(entities, len(tag_strs), entity_types) - - diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 7f8217645..3cfe7657b 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -21,6 +21,14 @@ cdef enum: OUT N_MOVES +MOVE_NAMES = [None] * N_MOVES +MOVE_NAMES[MISSING] = 'M' +MOVE_NAMES[BEGIN] = 'B' +MOVE_NAMES[IN] = 'I' +MOVE_NAMES[LAST] = 'L' +MOVE_NAMES[UNIT] = 'U' +MOVE_NAMES[OUT] = 'O' + cdef do_func_t[N_MOVES] do_funcs @@ -70,6 +78,23 @@ cdef class BiluoPushDown(TransitionSystem): move_labels[moves.index(move_str)][label] = True return move_labels + cdef int preprocess_gold(self, GoldParse gold) except -1: + biluo_strings = iob_to_biluo(gold.ner) + for i in range(gold.length): + gold.c_ner[i] = self.lookup_transition(biluo_strings[i]) + + cdef Transition lookup_transition(self, object name) except *: + if '-' in name: + move_str, label_str = name.split('-', 1) + label = self.label_ids[label_str] + else: + move_str = name + label = 0 + move = MOVE_NAMES.index(move_str) + for i in range(self.n_moves): + if self.c[i].move == move and self.c[i].label == label: + return self.c[i] + cdef Transition init_transition(self, int clas, int move, int label) except *: # TODO: Apparent Cython bug here when we try to use the Transition() # constructor with the function pointers @@ -101,9 +126,9 @@ cdef class BiluoPushDown(TransitionSystem): cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1: if not _is_valid(self.move, self.label, s): return 9000 - cdef bint is_sunk = _entity_is_sunk(s, gold.ner) - cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT - return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label, + cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner) + cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT + return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label, next_act, is_sunk) cdef bint _is_gold(int act, int tag, int g_act, int g_tag, diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index ad1aca12d..6360f4f8b 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -34,6 +34,8 @@ from .conll cimport GoldParse from . import _parse_features from ._parse_features cimport fill_context, CONTEXT_SIZE +from ._ner_features cimport _ner_features + DEBUG = False def set_debug(val): @@ -55,6 +57,8 @@ def get_templates(name): pf = _parse_features if name == 'zhang': return pf.arc_eager + elif name == 'ner': + return _ner_features.basic else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) @@ -95,7 +99,8 @@ cdef class GreedyParser: Transition best atom_t[CONTEXT_SIZE] context - + + self.moves.preprocess_gold(gold) cdef Pool mem = Pool() cdef State* state = init_state(mem, tokens.data, tokens.length) while not is_final(state): diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index e72d2799c..2b02e1e03 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -29,6 +29,10 @@ cdef class TransitionSystem: cdef const Transition* c cdef readonly int n_moves + cdef int preprocess_gold(self, GoldParse gold) except -1 + + cdef Transition lookup_transition(self, object name) except * + cdef Transition init_transition(self, int clas, int move, int label) except * cdef Transition best_valid(self, const weight_t* scores, const State* state) except * diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 63c5ef45c..ae9591690 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -28,6 +28,12 @@ cdef class TransitionSystem: self.label_ids['MISSING'] = -1 self.c = moves + cdef int preprocess_gold(self, GoldParse gold) except -1: + raise NotImplementedError + + cdef Transition lookup_transition(self, object name) except *: + raise NotImplementedError + cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError