Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-08-14 12:55:44 +02:00
commit ea8de11ad5
15 changed files with 660 additions and 143 deletions

View File

@ -36,6 +36,7 @@ MOD_NAMES = [
'spacy.syntax.transition_system', 'spacy.syntax.transition_system',
'spacy.syntax.arc_eager', 'spacy.syntax.arc_eager',
'spacy.syntax._parse_features', 'spacy.syntax._parse_features',
'spacy.syntax._beam_utils',
'spacy.gold', 'spacy.gold',
'spacy.tokens.doc', 'spacy.tokens.doc',
'spacy.tokens.span', 'spacy.tokens.span',

View File

@ -5,10 +5,12 @@ from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
import random import random
import cytoolz
from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU from thinc.neural._classes.selu import SELU
@ -19,10 +21,12 @@ from thinc.api import FeatureExtracter, with_getitem
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap from thinc.api import uniqued, wrap, flatten_add_lengths
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .tokens.doc import Doc from .tokens.doc import Doc
from . import util
import numpy import numpy
import io import io
@ -53,6 +57,27 @@ def _logistic(X, drop=0.):
return Y, logistic_bwd return Y, logistic_bwd
@layerize
def add_tuples(X, drop=0.):
"""Give inputs of sequence pairs, where each sequence is (vals, length),
sum the values, returning a single sequence.
If input is:
((vals1, length), (vals2, length)
Output is:
(vals1+vals2, length)
vals are a single tensor for the whole batch.
"""
(vals1, length1), (vals2, length2) = X
assert length1 == length2
def add_tuples_bwd(dY, sgd=None):
return (dY, dY)
return (vals1+vals2, length), add_tuples_bwd
def _zero_init(model): def _zero_init(model):
def _zero_init_impl(self, X, y): def _zero_init_impl(self, X, y):
self.W.fill(0) self.W.fill(0)
@ -61,6 +86,7 @@ def _zero_init(model):
model.W.fill(0.) model.W.fill(0.)
return model return model
@layerize @layerize
def _preprocess_doc(docs, drop=0.): def _preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs] keys = [doc.to_array([LOWER]) for doc in docs]
@ -72,7 +98,6 @@ def _preprocess_doc(docs, drop=0.):
return (keys, vals, lengths), None return (keys, vals, lengths), None
def _init_for_precomputed(W, ops): def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.: if (W**2).sum() != 0.:
return return
@ -80,6 +105,7 @@ def _init_for_precomputed(W, ops):
ops.xavier_uniform_init(reshaped) ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape) W[:] = reshaped.reshape(W.shape)
@describe.on_data(_set_dimensions_if_needed) @describe.on_data(_set_dimensions_if_needed)
@describe.attributes( @describe.attributes(
nI=Dimension("Input size"), nI=Dimension("Input size"),
@ -184,25 +210,36 @@ class PrecomputableMaxouts(Model):
return Yfp, backward return Yfp, backward
def drop_layer(layer, factor=2.):
def drop_layer_fwd(X, drop=0.):
drop *= factor
mask = layer.ops.get_dropout_mask((1,), drop)
if mask is None or mask > 0:
return layer.begin_update(X, drop=drop)
else:
return X, lambda dX, sgd=None: dX
return wrap(drop_layer_fwd, layer)
def Tok2Vec(width, embed_size, preprocess=None): def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
embed = (norm | prefix | suffix | shape ) embed = (norm | prefix | suffix | shape ) >> Maxout(width, width*4, pieces=3)
tok2vec = ( tok2vec = (
with_flatten( with_flatten(
asarray(Model.ops, dtype='uint64') asarray(Model.ops, dtype='uint64')
>> embed >> uniqued(embed, column=5)
>> Maxout(width, width*4, pieces=3) >> drop_layer(
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) Residual(
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) (ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) )
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), ) ** 4, pad=4
pad=4) )
) )
if preprocess not in (False, None): if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec tok2vec = preprocess >> tok2vec
@ -297,7 +334,8 @@ def zero_init(model):
def doc2feats(cols=None): def doc2feats(cols=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] if cols is None:
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.): def forward(docs, drop=0.):
feats = [] feats = []
for doc in docs: for doc in docs:
@ -323,6 +361,37 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
return vectors, backward return vectors, backward
def fine_tune(embedding, combine=None):
if combine is not None:
raise NotImplementedError(
"fine_tune currently only supports addition. Set combine=None")
def fine_tune_fwd(docs_tokvecs, drop=0.):
docs, tokvecs = docs_tokvecs
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
flat_tokvecs = embedding.ops.flatten(tokvecs)
flat_vecs = embedding.ops.flatten(vecs)
output = embedding.ops.unflatten(
(model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs),
lengths)
def fine_tune_bwd(d_output, sgd=None):
bp_vecs(d_output, sgd=sgd)
flat_grad = model.ops.flatten(d_output)
model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum()
model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum()
if sgd is not None:
sgd(model._mem.weights, model._mem.gradient, key=model.id)
return d_output
return output, fine_tune_bwd
model = wrap(fine_tune_fwd, embedding)
model.mix = model._mem.add((model.id, 'mix'), (2,))
model.mix.fill(1.)
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
return model
@layerize @layerize
def flatten(seqs, drop=0.): def flatten(seqs, drop=0.):
if isinstance(seqs[0], numpy.ndarray): if isinstance(seqs[0], numpy.ndarray):
@ -369,6 +438,27 @@ def preprocess_doc(docs, drop=0.):
vals = ops.allocate(keys.shape[0]) + 1 vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None return (keys, vals, lengths), None
def getitem(i):
def getitem_fwd(X, drop=0.):
return X[i], None
return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg):
embed_size = util.env_opt('embed_size', 7500)
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
model = (
fine_tune(private_tok2vec)
>> with_flatten(
Maxout(token_vector_width, token_vector_width)
>> Softmax(nr_class, token_vector_width)
)
)
model.nI = None
return model
def build_text_classifier(nr_class, width=64, **cfg): def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 200) nr_vector = cfg.get('nr_vector', 200)

View File

@ -21,10 +21,10 @@ CONVERTERS = {
@plac.annotations( @plac.annotations(
input_file=("input file", "positional", None, str), input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float), n_sents=("Number of sentences per doc", "option", "n", int),
morphology=("Enable appending morphology to tags", "flag", "m", bool) morphology=("Enable appending morphology to tags", "flag", "m", bool)
) )
def convert(cmd, input_file, output_dir, n_sents, morphology): def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
""" """
Convert files into JSON format for use with train command and other Convert files into JSON format for use with train command and other
experiment management functions. experiment management functions.

View File

@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
for batch in minibatch(train_docs, size=batch_sizes): for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch) docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses) drop=next(dropout_rates), losses=losses,
update_tensors=True)
pbar.update(sum(len(doc) for doc in docs)) pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
util.set_env_log(False) util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i) epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path) nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline) nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path) nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate( scorer = nlp_loaded.evaluate(

View File

@ -277,7 +277,8 @@ class Language(object):
def make_doc(self, text): def make_doc(self, text):
return self.tokenizer(text) return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_tensors=False):
"""Update the models in the pipeline. """Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects. docs (iterable): A batch of `Doc` objects.
@ -304,14 +305,17 @@ class Language(object):
grads[key] = (W, dW) grads[key] = (W, dW)
pipes = list(self.pipeline[1:]) pipes = list(self.pipeline[1:])
random.shuffle(pipes) random.shuffle(pipes)
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
for proc in pipes: for proc in pipes:
if not hasattr(proc, 'update'): if not hasattr(proc, 'update'):
continue continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds, d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses) drop=drop, sgd=get_grads, losses=losses)
if d_tokvecses is not None: if update_tensors and d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd) for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv
bp_tokvecses(all_d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items(): for key, (W, dW) in grads.items():
sgd(W, dW, key=key) sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory. # Clear the tensor variable, to free GPU memory.
@ -381,9 +385,18 @@ class Language(object):
return optimizer return optimizer
def evaluate(self, docs_golds): def evaluate(self, docs_golds):
docs, golds = zip(*docs_golds)
scorer = Scorer() scorer = Scorer()
for doc, gold in zip(self.pipe(docs, batch_size=32), golds): docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
else:
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
scorer.score(doc, gold) scorer.score(doc, gold)
doc.tensor = None doc.tensor = None
return scorer return scorer

View File

@ -42,7 +42,7 @@ from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X from .parts_of_speech import X
@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent):
name = 'tensorizer' name = 'tensorizer'
@classmethod @classmethod
def Model(cls, width=128, embed_size=7500, **cfg): def Model(cls, width=128, embed_size=4000, **cfg):
"""Create a new statistical model for the class. """Create a new statistical model for the class.
width (int): Output size of the model. width (int): Output size of the model.
@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent):
self.cfg = dict(cfg) self.cfg = dict(cfg)
def __call__(self, doc): def __call__(self, doc):
tags = self.predict([doc.tensor]) tags = self.predict(([doc], [doc.tensor]))
self.set_annotations([doc], tags) self.set_annotations([doc], tags)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream): for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecs = [d.tensor for d in docs] tokvecs = [d.tensor for d in docs]
tag_ids = self.predict(tokvecs) tag_ids = self.predict((docs, tokvecs))
self.set_annotations(docs, tag_ids) self.set_annotations(docs, tag_ids)
yield from docs yield from docs
def predict(self, tokvecs): def predict(self, docs_tokvecs):
scores = self.model(tokvecs) scores = self.model(docs_tokvecs)
scores = self.model.ops.flatten(scores) scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1) guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray): if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get() guesses = guesses.get()
tokvecs = docs_tokvecs[1]
guesses = self.model.ops.unflatten(guesses, guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs]) [tv.shape[0] for tv in tokvecs])
return guesses return guesses
@ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent):
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, 'get'):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags # Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0: if doc.c[j].tag == 0 and doc.c[j].pos == 0:
@ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent):
if self.model.nI is None: if self.model.nI is None:
self.model.nI = tokvecs[0].shape[1] self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
@ -346,9 +349,7 @@ class NeuralTagger(BaseThincComponent):
@classmethod @classmethod
def Model(cls, n_tags, token_vector_width): def Model(cls, n_tags, token_vector_width):
return with_flatten( return build_tagger_model(n_tags, token_vector_width)
chain(Maxout(token_vector_width, token_vector_width),
Softmax(n_tags, token_vector_width)))
def use_params(self, params): def use_params(self, params):
with self.model.use_params(params): with self.model.use_params(params):
@ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger):
@property @property
def labels(self): def labels(self):
return self.cfg.get('labels', {}) return self.cfg.setdefault('labels', {})
@labels.setter @labels.setter
def labels(self, value): def labels(self, value):
@ -455,9 +456,7 @@ class NeuralLabeller(NeuralTagger):
@classmethod @classmethod
def Model(cls, n_tags, token_vector_width): def Model(cls, n_tags, token_vector_width):
return with_flatten( return build_tagger_model(n_tags, token_vector_width)
chain(Maxout(token_vector_width, token_vector_width),
Softmax(n_tags, token_vector_width)))
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores) scores = self.model.ops.flatten(scores)

View File

@ -0,0 +1,273 @@
# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()
cdef class ParserBeam(object):
cdef public TransitionSystem moves
cdef public object states
cdef public object golds
cdef public object beams
def __init__(self, TransitionSystem moves, states, golds,
int width=4, float density=0.001):
self.moves = moves
self.states = states
self.golds = golds
self.beams = []
cdef Beam beam
cdef StateClass state, st
for state in states:
beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
for i in range(beam.width):
st = <StateClass>beam.at(i)
st.c.offset = state.c.offset
self.beams.append(beam)
def __dealloc__(self):
if self.beams is not None:
for beam in self.beams:
if beam is not None:
_cleanup(beam)
@property
def is_done(self):
return all(b.is_done for b in self.beams)
def __getitem__(self, i):
return self.beams[i]
def __len__(self):
return len(self.beams)
def advance(self, scores, follow_gold=False):
cdef Beam beam
for i, beam in enumerate(self.beams):
if beam.is_done or not scores[i].size:
continue
self._set_scores(beam, scores[i])
if self.golds is not None:
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
if follow_gold:
assert self.golds is not None
beam.advance(_transition_state, NULL, <void*>self.moves.c)
else:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
if beam.is_done:
for j in range(beam.size):
if is_gold(<StateClass>beam.at(j), self.golds[i], self.moves.strings):
beam._states[j].loss = 0.0
elif beam._states[j].loss == 0.0:
beam._states[j].loss = 1.0
def _set_scores(self, Beam beam, float[:, ::1] scores):
cdef float* c_scores = &scores[0, 0]
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.is_final():
for j in range(beam.nr_class):
beam.scores[i][j] = c_scores[i * beam.nr_class + j]
self.moves.set_valid(beam.is_valid[i], state.c)
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
if follow_gold:
for j in range(beam.nr_class):
if beam.costs[i][j] >= 1:
beam.is_valid[i][j] = 0
def is_gold(StateClass state, GoldParse gold, strings):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted
def get_token_ids(states, int n_tokens):
cdef StateClass state
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
dtype='int32', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
else:
ids[i] = -1
c_ids += ids.shape[1]
return ids
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, tokvecs, golds,
state2vec, vec2scores, drop=0., sgd=None,
losses=None, int width=4, float density=0.001):
global nr_update
nr_update += 1
pbeam = ParserBeam(moves, states, golds,
width=width, density=density)
gbeam = ParserBeam(moves, states, golds,
width=width, density=0.0)
cdef StateClass state
beam_maps = []
backprops = []
violns = [MaxViolation() for _ in range(len(states))]
for t in range(max_steps):
# The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history).
# We keep a different beam map for each step (since we'll have a flat
# scores array for each step). The beam map will let us take the per-state
# losses, and compute the gradient for each (step, state, class).
beam_maps.append({})
# Gather all states from the two beams in a list. Some stats may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
if not states:
break
# Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
# Store the callbacks for the backward pass
backprops.append((token_ids, bp_vectors, bp_scores))
# Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
# Now advance the states in the beams. The gold beam is contrained to
# to follow only gold analyses.
pbeam.advance(p_scores)
gbeam.advance(g_scores, follow_gold=True)
# Track the "maximum violation", to use in the update.
for i, violn in enumerate(violns):
violn.check_crf(pbeam[i], gbeam[i])
# Only make updates if we have non-gold states
histories = [((v.p_hist + v.g_hist) if v.p_hist else []) for v in violns]
losses = [((v.p_probs + v.g_probs) if v.p_probs else []) for v in violns]
states_d_scores = get_gradient(moves.n_moves, beam_maps,
histories, losses)
assert len(states_d_scores) == len(backprops), (len(states_d_scores), len(backprops))
return states_d_scores, backprops
def get_states(pbeams, gbeams, beam_map, nr_update):
seen = {}
states = []
p_indices = []
g_indices = []
cdef Beam pbeam, gbeam
assert len(pbeams) == len(gbeams)
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
p_indices.append([])
g_indices.append([])
if pbeam.loss > 0 and pbeam.min_score > gbeam.score:
continue
for i in range(pbeam.size):
state = <StateClass>pbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + pbeam.histories[i])
seen[key] = len(states)
p_indices[-1].append(len(states))
states.append(state)
beam_map.update(seen)
for i in range(gbeam.size):
state = <StateClass>gbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + gbeam.histories[i])
if key in seen:
g_indices[-1].append(seen[key])
else:
g_indices[-1].append(len(states))
beam_map[key] = len(states)
states.append(state)
p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
return states, p_idx, g_idx
def get_gradient(nr_class, beam_maps, histories, losses):
"""
The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
action i given history H."
Histories: Each hitory is a list of actions
Each candidate has a history
Each beam has multiple candidates
Each batch has multiple beams
So history is list of lists of lists of ints
"""
nr_step = len(beam_maps)
grads = []
for beam_map in beam_maps:
if beam_map:
grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f'))
assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
key = tuple([eg_id])
for j, clas in enumerate(hist):
i = beam_maps[j][key]
# In step j, at state i action clas
# resulted in loss
grads[j][i, clas] += loss / len(histories)
key = key + tuple([clas])
return grads

View File

@ -37,6 +37,7 @@ cdef cppclass StateC:
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint)) this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC)) this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity)) this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
this.offset = 0
cdef int i cdef int i
for i in range(length + (PADDING * 2)): for i in range(length + (PADDING * 2)):
this._ents[i].end = -1 this._ents[i].end = -1

View File

@ -385,6 +385,7 @@ cdef class ArcEager(TransitionSystem):
for i in range(self.n_moves): for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label: if self.c[i].move == move and self.c[i].label == label:
return self.c[i] return self.c[i]
return Transition(clas=0, move=MISSING, label=0)
def move_name(self, int move, attr_t label): def move_name(self, int move, attr_t label):
label_str = self.strings[label] label_str = self.strings[label]

View File

@ -34,6 +34,7 @@ from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context from ._parse_features cimport fill_context
from .stateclass cimport StateClass from .stateclass cimport StateClass
from .parser cimport Parser from .parser cimport Parser
from ._beam_utils import is_gold
DEBUG = False DEBUG = False
@ -237,16 +238,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
raise Exception("Gold parse is not gold-standard") raise Exception("Gold parse is not gold-standard")
def is_gold(StateClass state, GoldParse gold, StringStore strings):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted

View File

@ -14,8 +14,4 @@ cdef class Parser:
cdef readonly TransitionSystem moves cdef readonly TransitionSystem moves
cdef readonly object cfg cdef readonly object cfg
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

View File

@ -37,14 +37,17 @@ from preshed.maps cimport MapStruct
from preshed.maps cimport map_get from preshed.maps cimport map_get
from thinc.api import layerize, chain, noop, clone from thinc.api import layerize, chain, noop, clone
from thinc.neural import Model, Affine, ELU, ReLu, Maxout from thinc.neural import Model, Affine, ReLu, Maxout
from thinc.neural._classes.selu import SELU
from thinc.neural._classes.layernorm import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from .. import util from .. import util
from ..util import get_async, get_cuda_stream from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer
from ..compat import json_dumps from ..compat import json_dumps
from . import _parse_features from . import _parse_features
@ -59,8 +62,11 @@ from ..structs cimport TokenC
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..strings cimport StringStore from ..strings cimport StringStore
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..attrs cimport TAG, DEP from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
USE_FINE_TUNE = True
BEAM_PARSE = True
def get_templates(*args, **kwargs): def get_templates(*args, **kwargs):
return [] return []
@ -232,11 +238,14 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
""" """
@classmethod @classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg): def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth) depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width) token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width) hidden_width = util.env_opt('hidden_width', hidden_width)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
embed_size = util.env_opt('embed_size', 4000)
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
preprocess=doc2feats()))
if parser_maxout_pieces == 1: if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature, nF=cls.nr_feature,
@ -248,15 +257,10 @@ cdef class Parser:
nI=token_vector_width) nI=token_vector_width)
with Model.use_device('cpu'): with Model.use_device('cpu'):
if depth == 0:
upper = chain()
upper.is_noop = True
else:
upper = chain( upper = chain(
clone(Maxout(hidden_width), (depth-1)), clone(Residual(ReLu(hidden_width)), (depth-1)),
zero_init(Affine(nr_class, drop_factor=0.0)) zero_init(Affine(nr_class, drop_factor=0.0))
) )
upper.is_noop = False
# TODO: This is an unfortunate hack atm! # TODO: This is an unfortunate hack atm!
# Used to set input dimensions in network. # Used to set input dimensions in network.
lower.begin_training(lower.ops.allocate((500, token_vector_width))) lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@ -268,7 +272,7 @@ cdef class Parser:
'hidden_width': hidden_width, 'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces 'maxout_pieces': parser_maxout_pieces
} }
return (lower, upper), cfg return (tensors, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg): def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
""" """
@ -344,17 +348,21 @@ cdef class Parser:
The number of threads with which to work on the buffer in parallel. The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order. Yields (Doc): Documents, in order.
""" """
cdef StateClass parse_state if BEAM_PARSE:
beam_width = 8
cdef Doc doc cdef Doc doc
queue = [] cdef Beam beam
for docs in cytoolz.partition_all(batch_size, docs): for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs) docs = list(docs)
tokvecs = [d.tensor for d in docs] tokvecs = [doc.tensor for doc in docs]
if beam_width == 1: if beam_width == 1:
parse_states = self.parse_batch(docs, tokvecs) parse_states = self.parse_batch(docs, tokvecs)
else: else:
parse_states = self.beam_parse(docs, tokvecs, beams = self.beam_parse(docs, tokvecs,
beam_width=beam_width, beam_density=beam_density) beam_width=beam_width, beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
self.set_annotations(docs, parse_states) self.set_annotations(docs, parse_states)
yield from docs yield from docs
@ -369,8 +377,12 @@ cdef class Parser:
int nr_class, nr_feat, nr_piece, nr_dim, nr_state int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
if isinstance(tokvecses, np.ndarray):
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses) tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
nr_state = len(docs) nr_state = len(docs)
nr_class = self.moves.n_moves nr_class = self.moves.n_moves
@ -394,14 +406,7 @@ cdef class Parser:
cdef np.ndarray scores cdef np.ndarray scores
c_token_ids = <int*>token_ids.data c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data c_is_valid = <int*>is_valid.data
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
while not next_step.empty(): while not next_step.empty():
if not has_hidden:
for i in cython.parallel.prange(
next_step.size(), num_threads=6, nogil=True):
self._parse_step(next_step[i],
feat_weights, nr_class, nr_feat, nr_piece)
else:
for i in range(next_step.size()): for i in range(next_step.size()):
st = next_step[i] st = next_step[i]
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
@ -429,11 +434,15 @@ cdef class Parser:
cdef int nr_class = self.moves.n_moves cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses) tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0) cuda_stream, 0.0)
beams = [] beams = []
cdef int offset = 0 cdef int offset = 0
cdef int j = 0
cdef int k
for doc in docs: for doc in docs:
beam = Beam(nr_class, beam_width, min_density=beam_density) beam = Beam(nr_class, beam_width, min_density=beam_density)
beam.initialize(self.moves.init_beam_state, doc.length, doc.c) beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
@ -446,44 +455,31 @@ cdef class Parser:
states = [] states = []
for i in range(beam.size): for i in range(beam.size):
stcls = <StateClass>beam.at(i) stcls = <StateClass>beam.at(i)
# This way we avoid having to score finalized states
# We do have to take care to keep indexes aligned, though
if not stcls.is_final():
states.append(stcls) states.append(stcls)
token_ids = self.get_token_ids(states) token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids) vectors = state2vec(token_ids)
scores = vec2scores(vectors) scores = vec2scores(vectors)
j = 0
c_scores = <float*>scores.data
for i in range(beam.size): for i in range(beam.size):
stcls = <StateClass>beam.at(i) stcls = <StateClass>beam.at(i)
if not stcls.is_final(): if not stcls.is_final():
self.moves.set_valid(beam.is_valid[i], stcls.c) self.moves.set_valid(beam.is_valid[i], stcls.c)
for j in range(nr_class): for k in range(nr_class):
beam.scores[i][j] = scores[i, j] beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
j += 1
beam.advance(_transition_state, _hash_state, <void*>self.moves.c) beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL) beam.check_done(_check_final_state, NULL)
beams.append(beam) beams.append(beam)
return beams return beams
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil:
'''This only works with no hidden layers -- fast but inaccurate'''
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
token_ids = <int*>calloc(nr_feat, sizeof(int))
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
is_valid = <int*>calloc(nr_class, sizeof(int))
state.set_context_tokens(token_ids, nr_feat)
sum_state_features(scores,
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
self.moves.set_valid(is_valid, state)
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
action = self.moves.c[guess]
action.do(state, action.label)
free(is_valid)
free(scores)
free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if BEAM_PARSE:
return self.update_beam(docs_tokvecs, golds, drop=drop, sgd=sgd,
losses=losses)
if losses is not None and self.name not in losses: if losses is not None and self.name not in losses:
losses[self.name] = 0. losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs docs, tokvec_lists = docs_tokvecs
@ -491,6 +487,10 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse): if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs] docs = [docs]
golds = [golds] golds = [golds]
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
tokvecs += my_tokvecs
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
@ -517,13 +517,13 @@ cdef class Parser:
scores, bp_scores = vec2scores.begin_update(vector, drop=drop) scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores) d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) d_vector = bp_scores(d_scores, sgd=sgd)
if drop != 0: if drop != 0:
d_vector *= mask d_vector *= mask
if isinstance(self.model[0].ops, CupyOps) \ if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(token_ids, state2vec.ops.xp.ndarray): and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to CPU, asynchronously # Move token_ids and d_vector to GPU, asynchronously
backprops.append(( backprops.append((
get_async(cuda_stream, token_ids), get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector), get_async(cuda_stream, d_vector),
@ -540,7 +540,55 @@ cdef class Parser:
break break
self._make_updates(d_tokvecs, self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream) backprops, sgd, cuda_stream)
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
if USE_FINE_TUNE:
bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
lengths = [len(d) for d in docs]
assert min(lengths) >= 1
tokvecs = self.model[0].ops.flatten(tokvecs)
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
tokvecs += my_tokvecs
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, tokvecs, golds,
state2vec, vec2scores,
drop, sgd, losses,
width=8)
backprop_lower = []
for i, d_scores in enumerate(states_d_scores):
if losses is not None:
losses[self.name] += (d_scores**2).sum()
ids, bp_vectors, bp_scores = backprops[i]
d_vector = bp_scores(d_scores, sgd=sgd)
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray):
backprop_lower.append((
get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector),
bp_vectors))
else:
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
if USE_FINE_TUNE:
bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def _init_gold_batch(self, whole_docs, whole_golds): def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long """Make a square batch, of length equal to the shortest doc. A long
@ -585,14 +633,10 @@ cdef class Parser:
xp = get_array_module(d_tokvecs) xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops: for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd) d_state_features = bp_vector(d_vector, sgd=sgd)
active_feats = ids * (ids >= 0) mask = ids >= 0
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1)) indices = xp.nonzero(mask)
if hasattr(xp, 'scatter_add'): self.model[0].ops.scatter_add(d_tokvecs, ids[indices],
xp.scatter_add(d_tokvecs, d_state_features[indices])
ids, d_state_features * active_feats)
else:
xp.add.at(d_tokvecs,
ids, d_state_features * active_feats)
@property @property
def move_names(self): def move_names(self):
@ -603,7 +647,7 @@ cdef class Parser:
return names return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout): def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model _, lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs, state2vec = precompute_hiddens(batch_size, tokvecs,
lower, stream, drop=dropout) lower, stream, drop=dropout)
return state2vec, upper return state2vec, upper
@ -693,10 +737,12 @@ cdef class Parser:
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
serializers = { serializers = {
'lower_model': lambda p: p.open('wb').write( 'tok2vec_model': lambda p: p.open('wb').write(
self.model[0].to_bytes()), self.model[0].to_bytes()),
'upper_model': lambda p: p.open('wb').write( 'lower_model': lambda p: p.open('wb').write(
self.model[1].to_bytes()), self.model[1].to_bytes()),
'upper_model': lambda p: p.open('wb').write(
self.model[2].to_bytes()),
'vocab': lambda p: self.vocab.to_disk(p), 'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False), 'moves': lambda p: self.moves.to_disk(p, strings=False),
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) 'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
@ -717,24 +763,29 @@ cdef class Parser:
self.model, cfg = self.Model(**self.cfg) self.model, cfg = self.Model(**self.cfg)
else: else:
cfg = {} cfg = {}
with (path / 'lower_model').open('rb') as file_: with (path / 'tok2vec_model').open('rb') as file_:
bytes_data = file_.read() bytes_data = file_.read()
self.model[0].from_bytes(bytes_data) self.model[0].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_: with (path / 'lower_model').open('rb') as file_:
bytes_data = file_.read() bytes_data = file_.read()
self.model[1].from_bytes(bytes_data) self.model[1].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_:
bytes_data = file_.read()
self.model[2].from_bytes(bytes_data)
self.cfg.update(cfg) self.cfg.update(cfg)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
serializers = OrderedDict(( serializers = OrderedDict((
('lower_model', lambda: self.model[0].to_bytes()), ('tok2vec_model', lambda: self.model[0].to_bytes()),
('upper_model', lambda: self.model[1].to_bytes()), ('lower_model', lambda: self.model[1].to_bytes()),
('upper_model', lambda: self.model[2].to_bytes()),
('vocab', lambda: self.vocab.to_bytes()), ('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)), ('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: ujson.dumps(self.cfg)) ('cfg', lambda: ujson.dumps(self.cfg))
)) ))
if 'model' in exclude: if 'model' in exclude:
exclude['tok2vec_model'] = True
exclude['lower_model'] = True exclude['lower_model'] = True
exclude['upper_model'] = True exclude['upper_model'] = True
exclude.pop('model') exclude.pop('model')
@ -745,6 +796,7 @@ cdef class Parser:
('vocab', lambda b: self.vocab.from_bytes(b)), ('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))), ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('tok2vec_model', lambda b: None),
('lower_model', lambda b: None), ('lower_model', lambda b: None),
('upper_model', lambda b: None) ('upper_model', lambda b: None)
)) ))
@ -754,10 +806,12 @@ cdef class Parser:
self.model, cfg = self.Model(self.moves.n_moves) self.model, cfg = self.Model(self.moves.n_moves)
else: else:
cfg = {} cfg = {}
if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg: if 'lower_model' in msg:
self.model[0].from_bytes(msg['lower_model']) self.model[1].from_bytes(msg['lower_model'])
if 'upper_model' in msg: if 'upper_model' in msg:
self.model[1].from_bytes(msg['upper_model']) self.model[2].from_bytes(msg['upper_model'])
self.cfg.update(cfg) self.cfg.update(cfg)
return self return self

View File

@ -107,6 +107,8 @@ cdef class TransitionSystem:
def is_valid(self, StateClass stcls, move_name): def is_valid(self, StateClass stcls, move_name):
action = self.lookup_transition(move_name) action = self.lookup_transition(move_name)
if action.move == 0:
return False
return action.is_valid(stcls.c, action.label) return action.is_valid(stcls.c, action.label)
cdef int set_valid(self, int* is_valid, const StateC* st) nogil: cdef int set_valid(self, int* is_valid, const StateC* st) nogil:

View File

@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc):
parser(doc, beam_width=32, beam_density=0.001) parser(doc, beam_width=32, beam_density=0.001)
for word in doc: for word in doc:
print(word.text, word.head, word.dep_) print(word.text, word.head, word.dep_)
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs[0].sum() == 0.

View File

@ -0,0 +1,87 @@
from __future__ import unicode_literals
import pytest
import numpy
from thinc.api import layerize
from ...vocab import Vocab
from ...syntax.arc_eager import ArcEager
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax._beam_utils import ParserBeam, update_beam
from ...syntax.stateclass import StateClass
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def moves(vocab):
aeager = ArcEager(vocab.strings, {})
aeager.add_action(2, 'nsubj')
aeager.add_action(3, 'dobj')
aeager.add_action(2, 'aux')
return aeager
@pytest.fixture
def docs(vocab):
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
@pytest.fixture
def states(docs):
return [StateClass(doc) for doc in docs]
@pytest.fixture
def tokvecs(docs, vector_size):
output = []
for doc in docs:
vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
output.append(numpy.asarray(vec))
return output
@pytest.fixture
def golds(docs):
return [GoldParse(doc) for doc in docs]
@pytest.fixture
def batch_size(docs):
return len(docs)
@pytest.fixture
def beam_width():
return 4
@pytest.fixture
def vector_size():
return 6
@pytest.fixture
def beam(moves, states, golds, beam_width):
return ParserBeam(moves, states, golds, width=beam_width)
@pytest.fixture
def scores(moves, batch_size, beam_width):
return [
numpy.asarray(
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
dtype='f')
for _ in range(batch_size)]
def test_create_beam(beam):
pass
def test_beam_advance(beam, scores):
beam.advance(scores)
def test_beam_advance_too_few_scores(beam, scores):
with pytest.raises(IndexError):
beam.advance(scores[:-1])