# spacy/syntax/parser.pyx
# cython: infer_types=True
# cython: profile=True
"""
MALT-style dependency parser
"""
from __future__ import unicode_literals, print_function
cimport cython
cimport cython.parallel
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy
from libc.stdlib cimport malloc, calloc, free
import os.path
from os import path
import shutil
import json
import sys
from .nonproj import PseudoProjectivity
import random
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC, FeatureC
# NeuralNet and Example (used below) are assumed to come from these
# thinc 5.x module paths.
from thinc.neural.nn cimport NeuralNet
from thinc.extra.eg cimport Example
from preshed.maps cimport MapStruct, map_get
from util import Config
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
from ._parse_features cimport *
from .stateclass cimport StateClass
from ._state cimport StateC

DEBUG = False


def set_debug(val):
    global DEBUG
    DEBUG = val


def get_templates(name):
pf = _parse_features
if name == 'ner':
return pf.ner
elif name == 'debug':
return pf.unigrams
elif name.startswith('neural'):
features = pf.words + pf.tags + pf.labels
slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels)
return ([(f,) for f in features], slots)
else:
        return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 +
                pf.n0_n1 + pf.tree_shape + pf.trigrams)
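
# Example (illustrative): 'ner' selects the NER templates, a name starting
# with 'neural' returns a (features, slots) pair for the embedding layer,
# and anything else falls back to the full dependency template set:
#
#     features, slots = get_templates('neural')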


def ParserFactory(transition_system):
    return lambda strings, dir_: Parser(strings, dir_, transition_system)


cdef class ParserPerceptron(AveragedPerceptron):
@property
def widths(self):
return (self.extracter.nr_templ,)

    def update(self, Example eg):
        '''Does regression on negative cost. Sort of cute?'''
        self.time += 1
        cdef weight_t loss = 0.0
        best = eg.best
        for clas in range(eg.c.nr_class):
            if not eg.c.is_valid[clas]:
                continue
            if eg.c.scores[clas] < eg.c.scores[best]:
                continue
            # Regress each contending score towards the negated cost of its
            # class: zero-cost actions are pushed towards a score of zero,
            # costed actions towards negative scores.
            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
            d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
            step = d_loss * 0.001
            for feat in eg.c.features[:eg.c.nr_feat]:
                self.update_weight(feat.key, clas, feat.value * step)
        return int(loss)

    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
state = <const StateC*>_state
fill_context(eg.atoms, state)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)


cdef class ParserNeuralNet(NeuralNet):
def __init__(self, shape, **kwargs):
vector_widths = [4] * 57
slots = [0, 1, 2, 3] # S0
slots += [4, 5, 6, 7] # S1
slots += [8, 9, 10, 11] # S2
slots += [12, 13, 14, 15] # S3+
slots += [16, 17, 18, 19] # B0
slots += [20, 21, 22, 23] # B1
slots += [24, 25, 26, 27] # B2
slots += [28, 29, 30, 31] # B3+
slots += [32, 33, 34, 35] * 2 # S0l, S0r
slots += [36, 37, 38, 39] * 2 # B0l, B0r
slots += [40, 41, 42, 43] * 2 # S1l, S1r
slots += [44, 45, 46, 47] * 2 # S2l, S2r
slots += [48, 49, 50, 51, 52]
slots += [53, 54, 55, 56]
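        # Every slot uses a width-4 embedding and the slots list has 73
        # entries, so the input layer works out to 4 * 73 = 292 units.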
input_length = sum(vector_widths[slot] for slot in slots)
widths = [input_length] + shape[3:]
NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)

    @property
    def nr_feat(self):
        # Upper bound on the number of features set_featuresC may write;
        # the feature buffer is zeroed up to this size before filling.
        return 2000

cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
memset(eg.features, 0, 2000 * sizeof(FeatureC))
state = <const StateC*>_state
fill_context(eg.atoms, state)
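        # Feature layout: slots 0-31 cover the top of the stack and buffer,
        # slots 32-63 the left/right subtrees of S0, B0, S1 and S2, and
        # slots 64-72 hold hashed POS bigram/trigram conjunctions.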
feats = eg.features
feats = _add_token(feats, 0, state.S_(0), 1.0)
feats = _add_token(feats, 4, state.S_(1), 1.0)
feats = _add_token(feats, 8, state.S_(2), 1.0)
# Rest of the stack, with exponential decay
for i in range(3, state.stack_depth()):
feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
feats = _add_token(feats, 16, state.B_(0), 1.0)
feats = _add_token(feats, 20, state.B_(1), 1.0)
feats = _add_token(feats, 24, state.B_(2), 1.0)
# Rest of the buffer, with exponential decay
for i in range(3, min(8, state.buffer_length())):
feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
feats = _add_subtree(feats, 32, state, state.S(0))
feats = _add_subtree(feats, 40, state, state.B(0))
feats = _add_subtree(feats, 48, state, state.S(1))
feats = _add_subtree(feats, 56, state, state.S(2))
feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1))
feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0))
feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1))
feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1),
state.R_(state.S(0), 2))
feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1),
state.L_(state.S(0), 2))
eg.nr_feat = feats - eg.features


cdef inline FeatureC* _add_token(FeatureC* feats,
                                 int slot, const TokenC* token, weight_t value) nogil:
# Word
feats.i = slot
feats.key = token.lex.norm
feats.value = value
feats += 1
# POS tag
feats.i = slot+1
feats.key = token.tag
feats.value = value
feats += 1
# Dependency label
feats.i = slot+2
feats.key = token.dep
feats.value = value
feats += 1
# Word, label, tag
feats.i = slot+3
cdef uint64_t key[3]
key[0] = token.lex.cluster
key[1] = token.tag
key[2] = token.dep
feats.key = hash64(key, sizeof(key), 0)
feats.value = value
feats += 1
return feats


cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
    # Right children first, then left children, each with exponentially
    # decaying weight.
    value = 1.0
for i in range(state.n_R(t)):
feats = _add_token(feats, slot, state.R_(t, i+1), value)
value *= 0.5
slot += 4
value = 1.0
for i in range(state.n_L(t)):
feats = _add_token(feats, slot, state.L_(t, i+1), value)
value *= 0.5
return feats


cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
                                      const TokenC* t1, const TokenC* t2) nogil:
    # Hash the tag pair, seeded by the slot id so that the same pair gets a
    # distinct key in each slot.
    cdef uint64_t[2] key
    key[0] = t1.tag
    key[1] = t2.tag
    feat.i = slot
    feat.key = hash64(key, sizeof(key), slot)
    feat.value = 1.0
    return feat + 1


cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
                                       const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
cdef uint64_t[3] key
key[0] = t1.tag
key[1] = t2.tag
key[2] = t3.tag
feat.i = slot
feat.key = hash64(key, sizeof(key), slot)
feat.value = 1.0
return feat+1


cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5):
ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho)
self._models_c = <NeuralNetC**>self.mem.alloc(sizeof(NeuralNetC*), n)
self._masks = <int**>self.mem.alloc(sizeof(int*), n)
self._models = []
cdef ParserNeuralNet model
threshold = 1.5 / n
self._nr_model = n
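        # Give each ensemble member its own random binary feature mask. A
        # feature passes a mask with probability 1.5 / n, so on average it
        # is visible to about 1.5 of the n models.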
for i in range(n):
self._masks[i] = <int*>self.mem.alloc(sizeof(int), self.nr_feat)
for j in range(self.nr_feat):
self._masks[i][j] = random.random() < threshold
# We have to pass our pool here, because the embedding table passes
# it around.
model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho)
self._models_c[i] = &model.c
self._models.append(model)

    property eta:
def __get__(self):
return self._models[0].eta
def __set__(self, weight_t value):
for model in self._models:
model.eta = value

    def sparsify_embeddings(self, penalty):
p = 0.0
for model in self._models:
p += model.sparsify_embeddings(penalty)
return p / len(self._models)

    cdef void set_scoresC(self, weight_t* scores, const void* _feats,
                          int nr_feat, int is_sparse) nogil:
        nr_class = self.c.widths[self.c.nr_layer - 1]
        sub_scores = <weight_t*>calloc(nr_class, sizeof(weight_t))
        sub_feats = <FeatureC*>calloc(nr_feat, sizeof(FeatureC))
feats = <const FeatureC*>_feats
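        # Score with each member model on its own masked copy of the
        # features, then average the accumulated scores.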
for i in range(self._nr_model):
for j in range(nr_feat):
sub_feats[j] = feats[j]
sub_feats[j].value *= self._masks[i][j]
self.c = self._models_c[i][0]
self.c.weights = self._models_c[i].weights
self.c.gradient = self._models_c[i].gradient
ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1)
for j in range(nr_class):
scores[j] += sub_scores[j]
sub_scores[j] = 0.0
for j in range(nr_class):
scores[j] /= self._nr_model
free(sub_feats)
free(sub_scores)

    def update(self, Example eg):
        if eg.cost == 0:
            return 0.0
        loss = 0.0
        full_feats = <FeatureC*>calloc(eg.nr_feat, sizeof(FeatureC))
        memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat)
cdef ParserNeuralNet model
for i, model in enumerate(self._models):
for j in range(eg.nr_feat):
eg.c.features[j].value *= self._masks[i][j]
loss += model.update(eg)
memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat)
free(full_feats)
return loss

    def end_training(self):
for model in self._models:
model.end_training()


cdef class Parser:
    def __init__(self, StringStore strings, transition_system, model):
        self.moves = transition_system
        self.model = model

    @classmethod
    def from_dir(cls, model_dir, strings, transition_system):
        if not os.path.exists(model_dir):
            print("Warning: No model found at", model_dir, file=sys.stderr)
        elif not os.path.isdir(model_dir):
            print("Warning: model path:", model_dir, "is not a directory",
                  file=sys.stderr)
cfg = Config.read(model_dir, 'config')
moves = transition_system(strings, cfg.labels)
if cfg.get('model') == 'neural':
shape = [cfg.vector_widths, cfg.slots, cfg.feat_set]
shape.extend(cfg.hidden_layers)
shape.append(moves.n_moves)
            if (cfg.get('ensemble_size') or 0) >= 2:
model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho,
n=cfg.ensemble_size)
else:
model = ParserNeuralNet(shape, update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho)
else:
model = ParserPerceptron(get_templates(cfg.feat_set))
if path.exists(path.join(model_dir, 'model')):
model.load(path.join(model_dir, 'model'))
return cls(strings, moves, model)
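
    # Illustrative usage sketch (hypothetical paths; ArcEager is the
    # dependency transition system from spacy.syntax.arc_eager):
    #
    #     from spacy.syntax.arc_eager import ArcEager
    #     parser = Parser.from_dir('/path/to/model/deps', vocab.strings, ArcEager)
    #     parser(doc)
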
@classmethod
def load(cls, pkg_or_str_or_file, vocab):
# TODO
raise NotImplementedError(
"This should be here, but isn't yet =/. Use Parser.from_dir")

    def __reduce__(self):
return (Parser, (self.moves.strings, self.moves, self.model), None, None)

    def __call__(self, Doc tokens):
cdef int nr_class = self.moves.n_moves
cdef int nr_feat = self.model.nr_feat
with nogil:
self.parseC(tokens.c, tokens.length, nr_feat, nr_class)
# Check for KeyboardInterrupt etc. Untested
PyErr_CheckSignals()
self.moves.finalize_doc(tokens)

    def pipe(self, stream, int batch_size=1000, int n_threads=2):
cdef Pool mem = Pool()
cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
cdef Doc doc
cdef int i
cdef int nr_class = self.moves.n_moves
cdef int nr_feat = self.model.nr_feat
cdef int status
queue = []
for doc in stream:
doc_ptr[len(queue)] = doc.c
lengths[len(queue)] = doc.length
queue.append(doc)
if len(queue) == batch_size:
with nogil:
for i in cython.parallel.prange(batch_size, num_threads=n_threads):
status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class)
if status != 0:
with gil:
sent_str = queue[i].text
raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals()
for doc in queue:
self.moves.finalize_doc(doc)
yield doc
queue = []
batch_size = len(queue)
with nogil:
for i in cython.parallel.prange(batch_size, num_threads=n_threads):
status = self.parseC(doc_ptr[i], lengths[i], nr_feat, nr_class)
if status != 0:
with gil:
sent_str = queue[i].text
raise ValueError("Error parsing doc: %s" % sent_str)
PyErr_CheckSignals()
for doc in queue:
self.moves.finalize_doc(doc)
yield doc
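
    # Illustrative usage (hypothetical iterable `docs` of Doc objects);
    # documents come back parsed, in input order:
    #
    #     for doc in parser.pipe(docs, batch_size=1000, n_threads=4):
    #         pass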

    cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) with gil:
cdef Example py_eg = Example(nr_class=nr_class, nr_atom=CONTEXT_SIZE, nr_feat=nr_feat,
widths=self.model.widths)
cdef ExampleC* eg = py_eg.c
state = new StateC(tokens, length)
self.moves.initialize_state(state)
cdef int i
while not state.is_final():
self.model.set_featuresC(eg, state)
self.moves.set_valid(eg.is_valid, state)
self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat, 1)
            guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class)
            if guess < 0 or not eg.is_valid[guess]:
                # No valid action was available: report failure.
                return 1
            action = self.moves.c[guess]
            action.do(state, action.label)
py_eg.reset()
self.moves.finalize_state(state)
for i in range(length):
tokens[i] = state._sent[i]
del state
return 0

    def train(self, Doc tokens, GoldParse gold):
self.moves.preprocess_gold(gold)
cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
self.moves.initialize_state(stcls.c)
        cdef Example eg = Example(
            nr_class=self.moves.n_moves,
            widths=self.model.widths,
            nr_atom=CONTEXT_SIZE,
            nr_feat=self.model.nr_feat)
loss = 0
cdef Transition action
while not stcls.is_final():
self.model.set_featuresC(eg.c, stcls.c)
self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
assert guess >= 0
action = self.moves.c[guess]
action.do(stcls.c, action.label)
loss += self.model.update(eg)
eg.reset()
return loss
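
    # Illustrative training step (assumes `gold` is a GoldParse aligned to
    # `doc`): train() follows the model's own predictions and returns the
    # summed update loss for the sentence.
    #
    #     loss = parser.train(doc, gold)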

    def step_through(self, Doc doc):
        return StepwiseState(self, doc)

def from_transition_sequence(self, Doc doc, sequence):
with self.step_through(doc) as stepwise:
for transition in sequence:
stepwise.transition(transition)
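
    # Illustrative: replaying a transition sequence by name, where names
    # follow moves.move_name (e.g. 'S', 'D', 'L-nsubj', 'R-dobj'):
    #
    #     parser.from_transition_sequence(doc, ['S', 'L-nsubj', 'S', 'R-dobj'])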

    def add_label(self, label):
for action in self.moves.action_types:
self.moves.add_action(action, label)


cdef class StepwiseState:
    cdef readonly StateClass stcls
    cdef readonly Example eg
    cdef readonly Doc doc
    cdef readonly Parser parser

def __init__(self, Parser parser, Doc doc):
self.parser = parser
self.doc = doc
self.stcls = StateClass.init(doc.c, doc.length)
self.parser.moves.initialize_state(self.stcls.c)
self.eg = Example(
nr_class=self.parser.moves.n_moves,
nr_atom=CONTEXT_SIZE,
nr_feat=self.parser.model.nr_feat)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.finish()

    @property
    def is_final(self):
        return self.stcls.is_final()

    @property
    def stack(self):
        return self.stcls.stack

    @property
    def queue(self):
        return self.stcls.queue

    @property
    def heads(self):
        return [self.stcls.H(i) for i in range(self.stcls.c.length)]

    @property
    def deps(self):
        return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
                for i in range(self.stcls.c.length)]

def predict(self):
self.eg.reset()
self.parser.model.set_featuresC(self.eg.c, self.stcls.c)
self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c)
self.parser.model.set_scoresC(self.eg.c.scores,
self.eg.c.features, self.eg.c.nr_feat, 1)
cdef Transition action = self.parser.moves.c[self.eg.guess]
return self.parser.moves.move_name(action.move, action.label)

    def transition(self, action_name):
        moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3}
if action_name == '_':
action_name = self.predict()
action = self.parser.moves.lookup_transition(action_name)
elif action_name == 'L' or action_name == 'R':
self.predict()
move = moves[action_name]
clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c,
self.eg.c.nr_class)
action = self.parser.moves.c[clas]
else:
action = self.parser.moves.lookup_transition(action_name)
action.do(self.stcls.c, action.label)

    def finish(self):
if self.stcls.is_final():
self.parser.moves.finalize_state(self.stcls.c)
self.doc.set_parse(self.stcls.c._sent)
self.parser.moves.finalize_doc(self.doc)
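

# Illustrative stepwise session (hypothetical `parser` and `doc`): predict
# and apply one move at a time; finish() runs on exit and writes the parse
# back onto the doc.
#
#     with parser.step_through(doc) as stepwise:
#         while not stepwise.is_final:
#             stepwise.transition(stepwise.predict())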


cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
                       int nr_class) except -1:
    cdef weight_t score = 0
    cdef int best = -1
    cdef int i
    for i in range(nr_class):
        if actions[i].move == move and (best == -1 or scores[i] >= score):
            best = i
            score = scores[i]
    return best