# cython: infer_types=True # cython: profile=True """ MALT-style dependency parser """ from __future__ import unicode_literals cimport cython cimport cython.parallel from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy from libc.stdlib cimport malloc, calloc, free import os.path from os import path import shutil import json import sys from .nonproj import PseudoProjectivity import random from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linalg cimport VecVec from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC from preshed.maps cimport MapStruct from preshed.maps cimport map_get from thinc.structs cimport FeatureC from util import Config from ..structs cimport TokenC from ..tokens.doc cimport Doc from ..strings cimport StringStore from .transition_system import OracleError from .transition_system cimport TransitionSystem, Transition from ..gold cimport GoldParse from . import _parse_features from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from ._parse_features cimport * from .stateclass cimport StateClass from ._state cimport StateC DEBUG = False def set_debug(val): global DEBUG DEBUG = val def get_templates(name): pf = _parse_features if name == 'ner': return pf.ner elif name == 'debug': return pf.unigrams elif name.startswith('neural'): features = pf.words + pf.tags + pf.labels slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels) return ([(f,) for f in features], slots) else: return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ pf.tree_shape + pf.trigrams) def ParserFactory(transition_system): return lambda strings, dir_: Parser(strings, dir_, transition_system) cdef class ParserPerceptron(AveragedPerceptron): @property def widths(self): return (self.extracter.nr_templ,) def update(self, Example eg): '''Does regression on negative cost. Sort of cute?''' self.time += 1 cdef weight_t loss = 0.0 best = eg.best for clas in range(eg.c.nr_class): if not eg.c.is_valid[clas]: continue if eg.c.scores[clas] < eg.c.scores[best]: continue loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2 d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas]) step = d_loss * 0.001 for feat in eg.c.features[:eg.c.nr_feat]: self.update_weight(feat.key, clas, feat.value * step) return int(loss) cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil: state = _state fill_context(eg.atoms, state) eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) cdef class ParserNeuralNet(NeuralNet): def __init__(self, shape, **kwargs): vector_widths = [4] * 57 slots = [0, 1, 2, 3] # S0 slots += [4, 5, 6, 7] # S1 slots += [8, 9, 10, 11] # S2 slots += [12, 13, 14, 15] # S3+ slots += [16, 17, 18, 19] # B0 slots += [20, 21, 22, 23] # B1 slots += [24, 25, 26, 27] # B2 slots += [28, 29, 30, 31] # B3+ slots += [32, 33, 34, 35] * 2 # S0l, S0r slots += [36, 37, 38, 39] * 2 # B0l, B0r slots += [40, 41, 42, 43] * 2 # S1l, S1r slots += [44, 45, 46, 47] * 2 # S2l, S2r slots += [48, 49, 50, 51, 52] slots += [53, 54, 55, 56] input_length = sum(vector_widths[slot] for slot in slots) widths = [input_length] + shape[3:] NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs) @property def nr_feat(self): return 2000 cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil: memset(eg.features, 0, 2000 * sizeof(FeatureC)) state = _state fill_context(eg.atoms, state) feats = eg.features feats = _add_token(feats, 0, state.S_(0), 1.0) feats = _add_token(feats, 4, state.S_(1), 1.0) feats = _add_token(feats, 8, state.S_(2), 1.0) # Rest of the stack, with exponential decay for i in range(3, state.stack_depth()): feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2)) feats = _add_token(feats, 16, state.B_(0), 1.0) feats = _add_token(feats, 20, state.B_(1), 1.0) feats = _add_token(feats, 24, state.B_(2), 1.0) # Rest of the buffer, with exponential decay for i in range(3, min(8, state.buffer_length())): feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2)) feats = _add_subtree(feats, 32, state, state.S(0)) feats = _add_subtree(feats, 40, state, state.B(0)) feats = _add_subtree(feats, 48, state, state.S(1)) feats = _add_subtree(feats, 56, state, state.S(2)) feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0)) feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0)) feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0)) feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1)) feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1)) feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0)) feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1)) feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1), state.R_(state.S(0), 2)) feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1), state.L_(state.S(0), 2)) eg.nr_feat = feats - eg.features cdef inline FeatureC* _add_token(FeatureC* feats, int slot, const TokenC* token, weight_t value) nogil: # Word feats.i = slot feats.key = token.lex.norm feats.value = value feats += 1 # POS tag feats.i = slot+1 feats.key = token.tag feats.value = value feats += 1 # Dependency label feats.i = slot+2 feats.key = token.dep feats.value = value feats += 1 # Word, label, tag feats.i = slot+3 cdef uint64_t key[3] key[0] = token.lex.cluster key[1] = token.tag key[2] = token.dep feats.key = hash64(key, sizeof(key), 0) feats.value = value feats += 1 return feats cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil: value = 1.0 for i in range(state.n_R(t)): feats = _add_token(feats, slot, state.R_(t, i+1), value) value *= 0.5 slot += 4 value = 1.0 for i in range(state.n_L(t)): feats = _add_token(feats, slot, state.L_(t, i+1), value) value *= 0.5 return feats cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot, const TokenC* t1, const TokenC* t2) nogil: cdef uint64_t[2] key key[0] = t1.tag key[1] = t2.tag feat.i = slot feat.key = hash64(key, sizeof(key), slot) feat.value = 1.0 return feat+1 cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot, const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil: cdef uint64_t[3] key key[0] = t1.tag key[1] = t2.tag key[2] = t3.tag feat.i = slot feat.key = hash64(key, sizeof(key), slot) feat.value = 1.0 return feat+1 cdef class ParserNeuralNetEnsemble(ParserNeuralNet): def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5): ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho) self._models_c = self.mem.alloc(sizeof(NeuralNetC*), n) self._masks = self.mem.alloc(sizeof(int*), n) self._models = [] cdef ParserNeuralNet model threshold = 1.5 / n self._nr_model = n for i in range(n): self._masks[i] = self.mem.alloc(sizeof(int), self.nr_feat) for j in range(self.nr_feat): self._masks[i][j] = random.random() < threshold # We have to pass our pool here, because the embedding table passes # it around. model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho) self._models_c[i] = &model.c self._models.append(model) property eta: def __get__(self): return self._models[0].eta def __set__(self, weight_t value): for model in self._models: model.eta = value def sparsify_embeddings(self, penalty): p = 0.0 for model in self._models: p += model.sparsify_embeddings(penalty) return p / len(self._models) cdef void set_scoresC(self, weight_t* scores, const void* _feats, int nr_feat, int is_sparse) nogil: nr_class = self.c.widths[self.c.nr_layer-1] sub_scores = calloc(sizeof(weight_t), nr_class) sub_feats = calloc(sizeof(FeatureC), nr_feat) feats = _feats for i in range(self._nr_model): for j in range(nr_feat): sub_feats[j] = feats[j] sub_feats[j].value *= self._masks[i][j] self.c = self._models_c[i][0] self.c.weights = self._models_c[i].weights self.c.gradient = self._models_c[i].gradient ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1) for j in range(nr_class): scores[j] += sub_scores[j] sub_scores[j] = 0.0 for j in range(nr_class): scores[j] /= self._nr_model free(sub_feats) free(sub_scores) def update(self, Example eg): if eg.cost == 0: return 0.0 loss = 0.0 full_feats = calloc(sizeof(FeatureC), eg.nr_feat) memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat) cdef ParserNeuralNet model for i, model in enumerate(self._models): for j in range(eg.nr_feat): eg.c.features[j].value *= self._masks[i][j] loss += model.update(eg) memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat) free(full_feats) return loss def end_training(self): for model in self._models: model.end_training() cdef class Parser: def __init__(self, StringStore strings, transition_system, model): self.moves = transition_system self.model = model @classmethod def from_dir(cls, model_dir, strings, transition_system): if not os.path.exists(model_dir): print >> sys.stderr, "Warning: No model found at", model_dir elif not os.path.isdir(model_dir): print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory" cfg = Config.read(model_dir, 'config') moves = transition_system(strings, cfg.labels) if cfg.get('model') == 'neural': shape = [cfg.vector_widths, cfg.slots, cfg.feat_set] shape.extend(cfg.hidden_layers) shape.append(moves.n_moves) if cfg.get('ensemble_size') >= 2: model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step, eta=cfg.eta, rho=cfg.rho, n=cfg.ensemble_size) else: model = ParserNeuralNet(shape, update_step=cfg.update_step, eta=cfg.eta, rho=cfg.rho) else: model = ParserPerceptron(get_templates(cfg.feat_set)) if path.exists(path.join(model_dir, 'model')): model.load(path.join(model_dir, 'model')) return cls(strings, moves, model) @classmethod def load(cls, pkg_or_str_or_file, vocab): # TODO raise NotImplementedError( "This should be here, but isn't yet =/. Use Parser.from_dir") def __reduce__(self): return (Parser, (self.moves.strings, self.moves, self.model), None, None) def __call__(self, Doc tokens): cdef int nr_class = self.moves.n_moves cdef int nr_feat = self.model.nr_feat with nogil: self.parseC(tokens.c, tokens.length, nr_feat, nr_class) # Check for KeyboardInterrupt etc. Untested PyErr_CheckSignals() self.moves.finalize_doc(tokens) def pipe(self, stream, int batch_size=1000, int n_threads=2): cdef Pool mem = Pool() cdef TokenC** doc_ptr = mem.alloc(batch_size, sizeof(TokenC*)) cdef int* lengths = mem.alloc(batch_size, sizeof(int)) cdef Doc doc cdef int i cdef int nr_class = self.moves.n_moves cdef int nr_feat = self.model.nr_feat cdef int status queue = [] for doc in stream: doc_ptr[len(queue)] = doc.c lengths[len(queue)] = doc.length queue.append(doc) if len(queue) == batch_size: with nogil: for i in cython.parallel.prange(batch_size, num_threads=n_threads): status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class) if status != 0: with gil: sent_str = queue[i].text raise ValueError("Error parsing doc: %s" % sent_str) PyErr_CheckSignals() for doc in queue: self.moves.finalize_doc(doc) yield doc queue = [] batch_size = len(queue) with nogil: for i in cython.parallel.prange(batch_size, num_threads=n_threads): status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class) if status != 0: with gil: sent_str = queue[i].text raise ValueError("Error parsing doc: %s" % sent_str) PyErr_CheckSignals() for doc in queue: self.moves.finalize_doc(doc) yield doc cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) with gil: cdef Example py_eg = Example(nr_class=nr_class, nr_atom=CONTEXT_SIZE, nr_feat=nr_feat, widths=self.model.widths) cdef ExampleC* eg = py_eg.c state = new StateC(tokens, length) self.moves.initialize_state(state) cdef int i while not state.is_final(): self.model.set_featuresC(eg, state) self.moves.set_valid(eg.is_valid, state) self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat, 1) guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class) action = self.moves.c[guess] if not eg.is_valid[guess]: return 1 action.do(state, action.label) py_eg.reset() self.moves.finalize_state(state) for i in range(length): tokens[i] = state._sent[i] del state return 0 def train(self, Doc tokens, GoldParse gold): self.moves.preprocess_gold(gold) cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) self.moves.initialize_state(stcls.c) cdef Pool mem = Pool() cdef Example eg = Example( nr_class=self.moves.n_moves, widths=self.model.widths, nr_atom=CONTEXT_SIZE, nr_feat=self.model.nr_feat) loss = 0 cdef Transition action while not stcls.is_final(): self.model.set_featuresC(eg.c, stcls.c) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1) self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) assert guess >= 0 action = self.moves.c[guess] action.do(stcls.c, action.label) loss += self.model.update(eg) eg.reset() return loss def step_through(self, Doc doc): return StepwiseState(self, doc) def from_transition_sequence(self, Doc doc, sequence): with self.step_through(doc) as stepwise: for transition in sequence: stepwise.transition(transition) def add_label(self, label): for action in self.moves.action_types: self.moves.add_action(action, label) cdef class StepwiseState: cdef readonly StateClass stcls cdef readonly Example eg cdef readonly Doc doc cdef readonly Parser parser def __init__(self, Parser parser, Doc doc): self.parser = parser self.doc = doc self.stcls = StateClass.init(doc.c, doc.length) self.parser.moves.initialize_state(self.stcls.c) self.eg = Example( nr_class=self.parser.moves.n_moves, nr_atom=CONTEXT_SIZE, nr_feat=self.parser.model.nr_feat) def __enter__(self): return self def __exit__(self, type, value, traceback): self.finish() @property def is_final(self): return self.stcls.is_final() @property def stack(self): return self.stcls.stack @property def queue(self): return self.stcls.queue @property def heads(self): return [self.stcls.H(i) for i in range(self.stcls.c.length)] @property def deps(self): return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] for i in range(self.stcls.c.length)] def predict(self): self.eg.reset() self.parser.model.set_featuresC(self.eg.c, self.stcls.c) self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c) self.parser.model.set_scoresC(self.eg.c.scores, self.eg.c.features, self.eg.c.nr_feat, 1) cdef Transition action = self.parser.moves.c[self.eg.guess] return self.parser.moves.move_name(action.move, action.label) def transition(self, action_name): moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} if action_name == '_': action_name = self.predict() action = self.parser.moves.lookup_transition(action_name) elif action_name == 'L' or action_name == 'R': self.predict() move = moves[action_name] clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, self.eg.c.nr_class) action = self.parser.moves.c[clas] else: action = self.parser.moves.lookup_transition(action_name) action.do(self.stcls.c, action.label) def finish(self): if self.stcls.is_final(): self.parser.moves.finalize_state(self.stcls.c) self.doc.set_parse(self.stcls.c._sent) self.parser.moves.finalize_doc(self.doc) cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, int nr_class) except -1: cdef weight_t score = 0 cdef int mode = -1 cdef int i for i in range(nr_class): if actions[i].move == move and (mode == -1 or scores[i] >= score): mode = i score = scores[i] return mode