Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)

Commit ea8de11ad5 — Merge branch 'develop' of https://github.com/explosion/spaCy into develop
setup.py (1 line changed)

@@ -36,6 +36,7 @@ MOD_NAMES = [
     'spacy.syntax.transition_system',
     'spacy.syntax.arc_eager',
     'spacy.syntax._parse_features',
+    'spacy.syntax._beam_utils',
     'spacy.gold',
     'spacy.tokens.doc',
     'spacy.tokens.span',
spacy/_ml.py (120 lines changed)

@@ -5,10 +5,12 @@ from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
 import random
+import cytoolz

 from thinc.neural._classes.convolution import ExtractWindow
 from thinc.neural._classes.static_vectors import StaticVectors
 from thinc.neural._classes.batchnorm import BatchNorm
+from thinc.neural._classes.layernorm import LayerNorm as LN
 from thinc.neural._classes.resnet import Residual
 from thinc.neural import ReLu
 from thinc.neural._classes.selu import SELU

@@ -19,10 +21,12 @@ from thinc.api import FeatureExtracter, with_getitem
 from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
 from thinc.neural._classes.attention import ParametricAttention
 from thinc.linear.linear import LinearModel
-from thinc.api import uniqued, wrap
+from thinc.api import uniqued, wrap, flatten_add_lengths


 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
 from .tokens.doc import Doc
+from . import util

 import numpy
 import io
@@ -53,6 +57,27 @@ def _logistic(X, drop=0.):
     return Y, logistic_bwd


+@layerize
+def add_tuples(X, drop=0.):
+    """Given inputs of sequence pairs, where each sequence is (vals, length),
+    sum the values, returning a single sequence.
+
+    If input is:
+        ((vals1, length), (vals2, length))
+    Output is:
+        (vals1+vals2, length)
+
+    vals are a single tensor for the whole batch.
+    """
+    (vals1, length1), (vals2, length2) = X
+    assert length1 == length2
+
+    def add_tuples_bwd(dY, sgd=None):
+        return (dY, dY)
+
+    return (vals1+vals2, length), add_tuples_bwd
+
+
 def _zero_init(model):
     def _zero_init_impl(self, X, y):
         self.W.fill(0)
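Illustrative sketch only (not part of the diff) of what add_tuples computes on an already-flattened batch, using plain numpy stand-ins: the two value tensors are summed element-wise, the shared length is passed through, and the backward callback hands the same gradient to both inputs.

    import numpy

    vals1 = numpy.array([[1., 2.], [3., 4.]])      # values for input A
    vals2 = numpy.array([[10., 20.], [30., 40.]])  # values for input B
    length = 2                                     # both inputs report the same length

    summed = vals1 + vals2                 # forward: [[11., 22.], [33., 44.]], length unchanged
    d_summed = numpy.ones_like(summed)     # backward: the incoming gradient...
    d_vals1, d_vals2 = d_summed, d_summed  # ...is routed unchanged to both inputs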
@@ -61,6 +86,7 @@ def _zero_init(model):
     model.W.fill(0.)
     return model

+
 @layerize
 def _preprocess_doc(docs, drop=0.):
     keys = [doc.to_array([LOWER]) for doc in docs]

@@ -72,7 +98,6 @@ def _preprocess_doc(docs, drop=0.):
     return (keys, vals, lengths), None


-
 def _init_for_precomputed(W, ops):
     if (W**2).sum() != 0.:
         return

@@ -80,6 +105,7 @@ def _init_for_precomputed(W, ops):
     ops.xavier_uniform_init(reshaped)
     W[:] = reshaped.reshape(W.shape)

+
 @describe.on_data(_set_dimensions_if_needed)
 @describe.attributes(
     nI=Dimension("Input size"),
@@ -184,25 +210,36 @@ class PrecomputableMaxouts(Model):
     return Yfp, backward


+def drop_layer(layer, factor=2.):
+    def drop_layer_fwd(X, drop=0.):
+        drop *= factor
+        mask = layer.ops.get_dropout_mask((1,), drop)
+        if mask is None or mask > 0:
+            return layer.begin_update(X, drop=drop)
+        else:
+            return X, lambda dX, sgd=None: dX
+    return wrap(drop_layer_fwd, layer)
+
+
 def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
         norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
         shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')

-        embed = (norm | prefix | suffix | shape )
+        embed = (norm | prefix | suffix | shape ) >> Maxout(width, width*4, pieces=3)
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
-                >> embed
-                >> Maxout(width, width*4, pieces=3)
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
-                pad=4)
+                >> uniqued(embed, column=5)
+                >> drop_layer(
+                    Residual(
+                        (ExtractWindow(nW=1) >> ReLu(width, width*3))
+                    )
+                ) ** 4, pad=4
+            )
         )
     if preprocess not in (False, None):
         tok2vec = preprocess >> tok2vec
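The drop_layer wrapper added above gives the stacked residual blocks a stochastic-depth behaviour: during training, with probability drop*factor the wrapped layer is skipped entirely and the input passes through unchanged. A minimal illustrative sketch (plain Python, names are stand-ins for the thinc machinery):

    import numpy

    def drop_layer_sketch(layer_fn, x, drop=0.25, factor=2.):
        # With probability drop*factor, skip the layer (identity); otherwise apply it.
        if numpy.random.uniform() < drop * factor:
            return x
        return layer_fn(x)

    out = drop_layer_sketch(lambda x: x * 2.0, numpy.ones(3), drop=0.25)

At predict time (drop=0) the layer is always applied, matching the mask-is-None branch in drop_layer.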
@@ -297,7 +334,8 @@ def zero_init(model):


 def doc2feats(cols=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
+    if cols is None:
+        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     def forward(docs, drop=0.):
         feats = []
         for doc in docs:
@@ -323,6 +361,37 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
     return vectors, backward


+def fine_tune(embedding, combine=None):
+    if combine is not None:
+        raise NotImplementedError(
+            "fine_tune currently only supports addition. Set combine=None")
+    def fine_tune_fwd(docs_tokvecs, drop=0.):
+        docs, tokvecs = docs_tokvecs
+        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
+
+        vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
+        flat_tokvecs = embedding.ops.flatten(tokvecs)
+        flat_vecs = embedding.ops.flatten(vecs)
+        output = embedding.ops.unflatten(
+            (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs),
+            lengths)
+
+        def fine_tune_bwd(d_output, sgd=None):
+            bp_vecs(d_output, sgd=sgd)
+            flat_grad = model.ops.flatten(d_output)
+            model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum()
+            model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum()
+            if sgd is not None:
+                sgd(model._mem.weights, model._mem.gradient, key=model.id)
+            return d_output
+        return output, fine_tune_bwd
+    model = wrap(fine_tune_fwd, embedding)
+    model.mix = model._mem.add((model.id, 'mix'), (2,))
+    model.mix.fill(1.)
+    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    return model
+
+
 @layerize
 def flatten(seqs, drop=0.):
     if isinstance(seqs[0], numpy.ndarray):
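fine_tune learns a two-element mixing vector that blends the embedding layer's own vectors with the token tensors handed in from upstream. A minimal numpy sketch of the forward arithmetic and the mixing-weight gradients it accumulates (names are stand-ins, not the thinc API):

    import numpy

    mix = numpy.array([1., 1.])             # learned blending weights, initialised to 1.
    flat_vecs = numpy.random.rand(5, 8)     # embedding output, flattened over the batch
    flat_tokvecs = numpy.random.rand(5, 8)  # tensors received from the shared tok2vec

    output = mix[0] * flat_vecs + mix[1] * flat_tokvecs

    flat_grad = numpy.ones_like(output)     # gradients of the mix weights, as in fine_tune_bwd
    d_mix0 = flat_vecs.dot(flat_grad.T).sum()
    d_mix1 = flat_tokvecs.dot(flat_grad.T).sum()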
@@ -369,6 +438,27 @@ def preprocess_doc(docs, drop=0.):
     vals = ops.allocate(keys.shape[0]) + 1
     return (keys, vals, lengths), None

+def getitem(i):
+    def getitem_fwd(X, drop=0.):
+        return X[i], None
+    return layerize(getitem_fwd)
+
+def build_tagger_model(nr_class, token_vector_width, **cfg):
+    embed_size = util.env_opt('embed_size', 7500)
+    with Model.define_operators({'>>': chain, '+': add}):
+        # Input: (doc, tensor) tuples
+        private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
+
+        model = (
+            fine_tune(private_tok2vec)
+            >> with_flatten(
+                Maxout(token_vector_width, token_vector_width)
+                >> Softmax(nr_class, token_vector_width)
+            )
+        )
+    model.nI = None
+    return model
+

 def build_text_classifier(nr_class, width=64, **cfg):
     nr_vector = cfg.get('nr_vector', 200)
@@ -383,7 +473,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
             >> _flatten_add_lengths
             >> with_getitem(0,
                 uniqued(
                     (embed_lower | embed_prefix | embed_suffix | embed_shape)
                     >> Maxout(width, width+(width//2)*3))
                 >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
                 >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))

@@ -404,7 +494,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
             >> logistic
         )

     model.lsuv = False
     return model
spacy/cli/convert.py

@@ -21,10 +21,10 @@ CONVERTERS = {
 @plac.annotations(
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.
spacy/cli/train.py

@@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
+                           drop=next(dropout_rates), losses=losses,
+                           update_tensors=True)
                 pbar.update(sum(len(doc) for doc in docs))

         with nlp.use_params(optimizer.averages):
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                dill.dump(nlp, file_, -1)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
             scorer = nlp_loaded.evaluate(
spacy/language.py

@@ -277,7 +277,8 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

-    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None,
+               update_tensors=False):
         """Update the models in the pipeline.

         docs (iterable): A batch of `Doc` objects.

@@ -304,14 +305,17 @@ class Language(object):
                 grads[key] = (W, dW)
         pipes = list(self.pipeline[1:])
         random.shuffle(pipes)
+        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
+        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
         for proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
-            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            if d_tokvecses is not None:
-                bp_tokvecses(d_tokvecses, sgd=sgd)
+            if update_tensors and d_tokvecses is not None:
+                for i, d_tv in enumerate(d_tokvecses):
+                    all_d_tokvecses[i] += d_tv
+        bp_tokvecses(all_d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.

@@ -381,9 +385,18 @@ class Language(object):
         return optimizer

     def evaluate(self, docs_golds):
-        docs, golds = zip(*docs_golds)
         scorer = Scorer()
-        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
+        docs, golds = zip(*docs_golds)
+        docs = list(docs)
+        golds = list(golds)
+        for pipe in self.pipeline:
+            if not hasattr(pipe, 'pipe'):
+                for doc in docs:
+                    pipe(doc)
+            else:
+                docs = list(pipe.pipe(docs))
+        assert len(docs) == len(golds)
+        for doc, gold in zip(docs, golds):
             scorer.score(doc, gold)
             doc.tensor = None
         return scorer
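Illustrative sketch (not from the diff) of what the new update_tensors flag changes in Language.update: per-component tok2vec gradients are now accumulated and backpropagated once after the loop, rather than backpropagated per component. Plain numpy stand-ins:

    import numpy

    tokvecses = [numpy.zeros((4, 8)), numpy.zeros((6, 8))]        # one tensor per doc
    all_d_tokvecses = [numpy.zeros_like(tv) for tv in tokvecses]  # gradient accumulators

    update_tensors = True
    component_grads = [                       # pretend two components returned per-doc gradients
        [numpy.ones((4, 8)), numpy.ones((6, 8))],
        [numpy.ones((4, 8)) * 2, numpy.ones((6, 8)) * 2]]
    for d_tokvecses in component_grads:
        if update_tensors and d_tokvecses is not None:
            for i, d_tv in enumerate(d_tokvecses):
                all_d_tokvecses[i] += d_tv
    # A single backward call then receives the accumulated gradients.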
spacy/pipeline.pyx

@@ -42,7 +42,7 @@ from .compat import json_dumps

 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
-from ._ml import build_text_classifier
+from ._ml import build_text_classifier, build_tagger_model
 from .parts_of_speech import X


@@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent):
     name = 'tensorizer'

     @classmethod
-    def Model(cls, width=128, embed_size=7500, **cfg):
+    def Model(cls, width=128, embed_size=4000, **cfg):
         """Create a new statistical model for the class.

         width (int): Output size of the model.

@@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent):
         self.cfg = dict(cfg)

     def __call__(self, doc):
-        tags = self.predict([doc.tensor])
+        tags = self.predict(([doc], [doc.tensor]))
         self.set_annotations([doc], tags)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
+            docs = list(docs)
             tokvecs = [d.tensor for d in docs]
-            tag_ids = self.predict(tokvecs)
+            tag_ids = self.predict((docs, tokvecs))
             self.set_annotations(docs, tag_ids)
             yield from docs

-    def predict(self, tokvecs):
-        scores = self.model(tokvecs)
+    def predict(self, docs_tokvecs):
+        scores = self.model(docs_tokvecs)
         scores = self.model.ops.flatten(scores)
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
+        tokvecs = docs_tokvecs[1]
         guesses = self.model.ops.unflatten(guesses,
                                            [tv.shape[0] for tv in tokvecs])
         return guesses

@@ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent):
         cdef Vocab vocab = self.vocab
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
+            if hasattr(doc_tag_ids, 'get'):
+                doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 # Don't clobber preset POS tags
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:

@@ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent):

         if self.model.nI is None:
             self.model.nI = tokvecs[0].shape[1]
-        tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
+        tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)

         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)

@@ -346,10 +349,8 @@ class NeuralTagger(BaseThincComponent):

     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return with_flatten(
-            chain(Maxout(token_vector_width, token_vector_width),
-                  Softmax(n_tags, token_vector_width)))
+        return build_tagger_model(n_tags, token_vector_width)

     def use_params(self, params):
         with self.model.use_params(params):
             yield

@@ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger):

     @property
     def labels(self):
-        return self.cfg.get('labels', {})
+        return self.cfg.setdefault('labels', {})

     @labels.setter
     def labels(self, value):

@@ -455,10 +456,8 @@ class NeuralLabeller(NeuralTagger):

     @classmethod
     def Model(cls, n_tags, token_vector_width):
-        return with_flatten(
-            chain(Maxout(token_vector_width, token_vector_width),
-                  Softmax(n_tags, token_vector_width)))
+        return build_tagger_model(n_tags, token_vector_width)

     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
         cdef int idx = 0
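Illustrative shapes only (stand-ins, not a runnable pipeline): after this change the tagger's predict() receives a (docs, tokvecs) pair instead of the raw tensors, so the model built by build_tagger_model can fine-tune its own token vectors before scoring.

    import numpy

    docs = ["doc1", "doc2"]                                   # stand-ins for Doc objects
    tokvecs = [numpy.zeros((3, 128)), numpy.zeros((5, 128))]  # one tensor per doc

    docs_tokvecs = (docs, tokvecs)             # what NeuralTagger.predict now receives
    flat = numpy.vstack(tokvecs)               # scores are computed over the flattened batch...
    lengths = [tv.shape[0] for tv in tokvecs]  # ...and guesses are unflattened per doc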
spacy/syntax/_beam_utils.pyx (new file, 273 lines)

# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t

from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc


# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    dest = <StateClass>_dest
    src = <StateClass>_src
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest.c, moves[clas].label)


cdef int _check_final_state(void* _state, void* extra_args) except -1:
    return (<StateClass>_state).is_final()


def _cleanup(Beam beam):
    for i in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[i].content)
        Py_XDECREF(<PyObject*>beam._parents[i].content)


cdef hash_t _hash_state(void* _state, void* _) except 0:
    state = <StateClass>_state
    if state.c.is_final():
        return 1
    else:
        return state.c.hash()


cdef class ParserBeam(object):
    cdef public TransitionSystem moves
    cdef public object states
    cdef public object golds
    cdef public object beams

    def __init__(self, TransitionSystem moves, states, golds,
                 int width=4, float density=0.001):
        self.moves = moves
        self.states = states
        self.golds = golds
        self.beams = []
        cdef Beam beam
        cdef StateClass state, st
        for state in states:
            beam = Beam(self.moves.n_moves, width, density)
            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
            for i in range(beam.width):
                st = <StateClass>beam.at(i)
                st.c.offset = state.c.offset
            self.beams.append(beam)

    def __dealloc__(self):
        if self.beams is not None:
            for beam in self.beams:
                if beam is not None:
                    _cleanup(beam)

    @property
    def is_done(self):
        return all(b.is_done for b in self.beams)

    def __getitem__(self, i):
        return self.beams[i]

    def __len__(self):
        return len(self.beams)

    def advance(self, scores, follow_gold=False):
        cdef Beam beam
        for i, beam in enumerate(self.beams):
            if beam.is_done or not scores[i].size:
                continue
            self._set_scores(beam, scores[i])
            if self.golds is not None:
                self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
            if follow_gold:
                assert self.golds is not None
                beam.advance(_transition_state, NULL, <void*>self.moves.c)
            else:
                beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
            beam.check_done(_check_final_state, NULL)
            if beam.is_done:
                for j in range(beam.size):
                    if is_gold(<StateClass>beam.at(j), self.golds[i], self.moves.strings):
                        beam._states[j].loss = 0.0
                    elif beam._states[j].loss == 0.0:
                        beam._states[j].loss = 1.0

    def _set_scores(self, Beam beam, float[:, ::1] scores):
        cdef float* c_scores = &scores[0, 0]
        for i in range(beam.size):
            state = <StateClass>beam.at(i)
            if not state.is_final():
                for j in range(beam.nr_class):
                    beam.scores[i][j] = c_scores[i * beam.nr_class + j]
                self.moves.set_valid(beam.is_valid[i], state.c)

    def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
        for i in range(beam.size):
            state = <StateClass>beam.at(i)
            if not state.c.is_final():
                self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
                if follow_gold:
                    for j in range(beam.nr_class):
                        if beam.costs[i][j] >= 1:
                            beam.is_valid[i][j] = 0


def is_gold(StateClass state, GoldParse gold, strings):
    predicted = set()
    truth = set()
    for i in range(gold.length):
        if gold.cand_to_gold[i] is None:
            continue
        if state.safe_get(i).dep:
            predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
        else:
            predicted.add((i, state.H(i), 'ROOT'))
        id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
        truth.add((id_, head, dep))
    return truth == predicted


def get_token_ids(states, int n_tokens):
    cdef StateClass state
    cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
                                      dtype='int32', order='C')
    c_ids = <int*>ids.data
    for i, state in enumerate(states):
        if not state.is_final():
            state.c.set_context_tokens(c_ids, n_tokens)
        else:
            ids[i] = -1
        c_ids += ids.shape[1]
    return ids


nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, tokvecs, golds,
                state2vec, vec2scores, drop=0., sgd=None,
                losses=None, int width=4, float density=0.001):
    global nr_update
    nr_update += 1
    pbeam = ParserBeam(moves, states, golds,
                       width=width, density=density)
    gbeam = ParserBeam(moves, states, golds,
                       width=width, density=0.0)
    cdef StateClass state
    beam_maps = []
    backprops = []
    violns = [MaxViolation() for _ in range(len(states))]
    for t in range(max_steps):
        # The beam maps let us find the right row in the flattened scores
        # arrays for each state. States are identified by (example id, history).
        # We keep a different beam map for each step (since we'll have a flat
        # scores array for each step). The beam map will let us take the per-state
        # losses, and compute the gradient for each (step, state, class).
        beam_maps.append({})
        # Gather all states from the two beams in a list. Some states may occur
        # in both beams. To figure out which beam each state belonged to,
        # we keep two lists of indices, p_indices and g_indices
        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
        if not states:
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

        # Store the callbacks for the backward pass
        backprops.append((token_ids, bp_vectors, bp_scores))

        # Unpack the flat scores into lists for the two beams. The indices arrays
        # tell us which example and state the scores-row refers to.
        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
        # Now advance the states in the beams. The gold beam is constrained
        # to follow only gold analyses.
        pbeam.advance(p_scores)
        gbeam.advance(g_scores, follow_gold=True)
        # Track the "maximum violation", to use in the update.
        for i, violn in enumerate(violns):
            violn.check_crf(pbeam[i], gbeam[i])

    # Only make updates if we have non-gold states
    histories = [((v.p_hist + v.g_hist) if v.p_hist else []) for v in violns]
    losses = [((v.p_probs + v.g_probs) if v.p_probs else []) for v in violns]
    states_d_scores = get_gradient(moves.n_moves, beam_maps,
                                   histories, losses)
    assert len(states_d_scores) == len(backprops), (len(states_d_scores), len(backprops))
    return states_d_scores, backprops


def get_states(pbeams, gbeams, beam_map, nr_update):
    seen = {}
    states = []
    p_indices = []
    g_indices = []
    cdef Beam pbeam, gbeam
    assert len(pbeams) == len(gbeams)
    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
        p_indices.append([])
        g_indices.append([])
        if pbeam.loss > 0 and pbeam.min_score > gbeam.score:
            continue
        for i in range(pbeam.size):
            state = <StateClass>pbeam.at(i)
            if not state.is_final():
                key = tuple([eg_id] + pbeam.histories[i])
                seen[key] = len(states)
                p_indices[-1].append(len(states))
                states.append(state)
        beam_map.update(seen)
        for i in range(gbeam.size):
            state = <StateClass>gbeam.at(i)
            if not state.is_final():
                key = tuple([eg_id] + gbeam.histories[i])
                if key in seen:
                    g_indices[-1].append(seen[key])
                else:
                    g_indices[-1].append(len(states))
                    beam_map[key] = len(states)
                    states.append(state)
    p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
    g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
    return states, p_idx, g_idx


def get_gradient(nr_class, beam_maps, histories, losses):
    """
    The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
    action i given history H."

    Histories: Each history is a list of actions
        Each candidate has a history
        Each beam has multiple candidates
        Each batch has multiple beams
    So history is a list of lists of lists of ints
    """
    nr_step = len(beam_maps)
    grads = []
    for beam_map in beam_maps:
        if beam_map:
            grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f'))
    assert len(histories) == len(losses)
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            key = tuple([eg_id])
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]
                # In step j, at state i action clas
                # resulted in loss
                grads[j][i, clas] += loss / len(histories)
                key = key + tuple([clas])
    return grads
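A small worked example (illustrative, not part of the commit) of how get_gradient spreads a parse-level loss over every action in its history, mirroring the loop above with tiny plain-Python inputs:

    import numpy

    nr_class = 3
    beam_maps = [{(0,): 0}, {(0, 2): 0}]   # per step: {(eg_id, *history): row in scores}
    histories = [[[2, 1]]]                 # example 0 has one candidate: actions [2, 1]
    losses = [[0.5]]                       # that candidate's loss

    grads = [numpy.zeros((1, nr_class), dtype='f') for _ in beam_maps]
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            key = (eg_id,)
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]
                grads[j][i, clas] += loss / len(histories)  # same loss at every step
                key = key + (clas,)
    # grads[0][0, 2] == 0.5 and grads[1][0, 1] == 0.5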
spacy/syntax/_state.pxd

@@ -37,6 +37,7 @@ cdef cppclass StateC:
         this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
         this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
         this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
+        this.offset = 0
         cdef int i
         for i in range(length + (PADDING * 2)):
             this._ents[i].end = -1
spacy/syntax/arc_eager.pyx

@@ -385,6 +385,7 @@ cdef class ArcEager(TransitionSystem):
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]
+        return Transition(clas=0, move=MISSING, label=0)

     def move_name(self, int move, attr_t label):
         label_str = self.strings[label]
@@ -34,6 +34,7 @@ from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from .parser cimport Parser
+from ._beam_utils import is_gold


 DEBUG = False

@@ -237,16 +238,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
             raise Exception("Gold parse is not gold-standard")


-def is_gold(StateClass state, GoldParse gold, StringStore strings):
-    predicted = set()
-    truth = set()
-    for i in range(gold.length):
-        if gold.cand_to_gold[i] is None:
-            continue
-        if state.safe_get(i).dep:
-            predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
-        else:
-            predicted.add((i, state.H(i), 'ROOT'))
-        id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
-        truth.add((id_, head, dep))
-    return truth == predicted
@@ -14,8 +14,4 @@ cdef class Parser:
     cdef readonly TransitionSystem moves
     cdef readonly object cfg

-    cdef void _parse_step(self, StateC* state,
-            const float* feat_weights,
-            int nr_class, int nr_feat, int nr_piece) nogil
-
     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
@ -37,14 +37,17 @@ from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.api import layerize, chain, noop, clone
|
from thinc.api import layerize, chain, noop, clone
|
||||||
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
|
from thinc.neural import Model, Affine, ReLu, Maxout
|
||||||
|
from thinc.neural._classes.selu import SELU
|
||||||
|
from thinc.neural._classes.layernorm import LayerNorm
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import get_async, get_cuda_stream
|
from ..util import get_async, get_cuda_stream
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||||
|
from .._ml import Residual, drop_layer
|
||||||
from ..compat import json_dumps
|
from ..compat import json_dumps
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
|
@ -59,8 +62,11 @@ from ..structs cimport TokenC
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..attrs cimport TAG, DEP
|
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
||||||
|
from . import _beam_utils
|
||||||
|
|
||||||
|
USE_FINE_TUNE = True
|
||||||
|
BEAM_PARSE = True
|
||||||
|
|
||||||
def get_templates(*args, **kwargs):
|
def get_templates(*args, **kwargs):
|
||||||
return []
|
return []
|
||||||
|
@ -232,11 +238,14 @@ cdef class Parser:
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
|
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
|
||||||
depth = util.env_opt('parser_hidden_depth', depth)
|
depth = util.env_opt('parser_hidden_depth', depth)
|
||||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||||
|
embed_size = util.env_opt('embed_size', 4000)
|
||||||
|
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
|
||||||
|
preprocess=doc2feats()))
|
||||||
if parser_maxout_pieces == 1:
|
if parser_maxout_pieces == 1:
|
||||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
|
@ -248,15 +257,10 @@ cdef class Parser:
|
||||||
nI=token_vector_width)
|
nI=token_vector_width)
|
||||||
|
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
if depth == 0:
|
upper = chain(
|
||||||
upper = chain()
|
clone(Residual(ReLu(hidden_width)), (depth-1)),
|
||||||
upper.is_noop = True
|
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||||
else:
|
)
|
||||||
upper = chain(
|
|
||||||
clone(Maxout(hidden_width), (depth-1)),
|
|
||||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
|
||||||
)
|
|
||||||
upper.is_noop = False
|
|
||||||
# TODO: This is an unfortunate hack atm!
|
# TODO: This is an unfortunate hack atm!
|
||||||
# Used to set input dimensions in network.
|
# Used to set input dimensions in network.
|
||||||
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
||||||
|
@ -268,7 +272,7 @@ cdef class Parser:
|
||||||
'hidden_width': hidden_width,
|
'hidden_width': hidden_width,
|
||||||
'maxout_pieces': parser_maxout_pieces
|
'maxout_pieces': parser_maxout_pieces
|
||||||
}
|
}
|
||||||
return (lower, upper), cfg
|
return (tensors, lower, upper), cfg
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||||
"""
|
"""
|
||||||
|
@ -344,17 +348,21 @@ cdef class Parser:
|
||||||
The number of threads with which to work on the buffer in parallel.
|
The number of threads with which to work on the buffer in parallel.
|
||||||
Yields (Doc): Documents, in order.
|
Yields (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
cdef StateClass parse_state
|
if BEAM_PARSE:
|
||||||
|
beam_width = 8
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
queue = []
|
cdef Beam beam
|
||||||
for docs in cytoolz.partition_all(batch_size, docs):
|
for docs in cytoolz.partition_all(batch_size, docs):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
tokvecs = [d.tensor for d in docs]
|
tokvecs = [doc.tensor for doc in docs]
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
parse_states = self.parse_batch(docs, tokvecs)
|
parse_states = self.parse_batch(docs, tokvecs)
|
||||||
else:
|
else:
|
||||||
parse_states = self.beam_parse(docs, tokvecs,
|
beams = self.beam_parse(docs, tokvecs,
|
||||||
beam_width=beam_width, beam_density=beam_density)
|
beam_width=beam_width, beam_density=beam_density)
|
||||||
|
parse_states = []
|
||||||
|
for beam in beams:
|
||||||
|
parse_states.append(<StateClass>beam.at(0))
|
||||||
self.set_annotations(docs, parse_states)
|
self.set_annotations(docs, parse_states)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -369,8 +377,12 @@ cdef class Parser:
|
||||||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
if isinstance(tokvecses, np.ndarray):
|
||||||
|
tokvecses = [tokvecses]
|
||||||
|
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
|
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
|
@ -394,27 +406,20 @@ cdef class Parser:
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
c_token_ids = <int*>token_ids.data
|
c_token_ids = <int*>token_ids.data
|
||||||
c_is_valid = <int*>is_valid.data
|
c_is_valid = <int*>is_valid.data
|
||||||
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
|
||||||
while not next_step.empty():
|
while not next_step.empty():
|
||||||
if not has_hidden:
|
for i in range(next_step.size()):
|
||||||
for i in cython.parallel.prange(
|
st = next_step[i]
|
||||||
next_step.size(), num_threads=6, nogil=True):
|
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||||
self._parse_step(next_step[i],
|
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
||||||
feat_weights, nr_class, nr_feat, nr_piece)
|
|
||||||
else:
|
|
||||||
for i in range(next_step.size()):
|
|
||||||
st = next_step[i]
|
|
||||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
|
||||||
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
|
||||||
vectors = state2vec(token_ids[:next_step.size()])
|
vectors = state2vec(token_ids[:next_step.size()])
|
||||||
scores = vec2scores(vectors)
|
scores = vec2scores(vectors)
|
||||||
c_scores = <float*>scores.data
|
c_scores = <float*>scores.data
|
||||||
for i in range(next_step.size()):
|
for i in range(next_step.size()):
|
||||||
st = next_step[i]
|
st = next_step[i]
|
||||||
guess = arg_max_if_valid(
|
guess = arg_max_if_valid(
|
||||||
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
||||||
action = self.moves.c[guess]
|
action = self.moves.c[guess]
|
||||||
action.do(st, action.label)
|
action.do(st, action.label)
|
||||||
this_step, next_step = next_step, this_step
|
this_step, next_step = next_step, this_step
|
||||||
next_step.clear()
|
next_step.clear()
|
||||||
for st in this_step:
|
for st in this_step:
|
||||||
|
@ -429,11 +434,15 @@ cdef class Parser:
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cdef StateClass stcls, output
|
cdef StateClass stcls, output
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||||
cuda_stream, 0.0)
|
cuda_stream, 0.0)
|
||||||
beams = []
|
beams = []
|
||||||
cdef int offset = 0
|
cdef int offset = 0
|
||||||
|
cdef int j = 0
|
||||||
|
cdef int k
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
||||||
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
||||||
|
@ -446,44 +455,31 @@ cdef class Parser:
|
||||||
states = []
|
states = []
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
stcls = <StateClass>beam.at(i)
|
||||||
states.append(stcls)
|
# This way we avoid having to score finalized states
|
||||||
|
# We do have to take care to keep indexes aligned, though
|
||||||
|
if not stcls.is_final():
|
||||||
|
states.append(stcls)
|
||||||
token_ids = self.get_token_ids(states)
|
token_ids = self.get_token_ids(states)
|
||||||
vectors = state2vec(token_ids)
|
vectors = state2vec(token_ids)
|
||||||
scores = vec2scores(vectors)
|
scores = vec2scores(vectors)
|
||||||
|
j = 0
|
||||||
|
c_scores = <float*>scores.data
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
stcls = <StateClass>beam.at(i)
|
stcls = <StateClass>beam.at(i)
|
||||||
if not stcls.is_final():
|
if not stcls.is_final():
|
||||||
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
||||||
for j in range(nr_class):
|
for k in range(nr_class):
|
||||||
beam.scores[i][j] = scores[i, j]
|
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
|
||||||
|
j += 1
|
||||||
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||||
beam.check_done(_check_final_state, NULL)
|
beam.check_done(_check_final_state, NULL)
|
||||||
beams.append(beam)
|
beams.append(beam)
|
||||||
return beams
|
return beams
|
||||||
|
|
||||||
cdef void _parse_step(self, StateC* state,
|
|
||||||
const float* feat_weights,
|
|
||||||
int nr_class, int nr_feat, int nr_piece) nogil:
|
|
||||||
'''This only works with no hidden layers -- fast but inaccurate'''
|
|
||||||
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
|
|
||||||
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
|
|
||||||
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
|
||||||
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
|
|
||||||
is_valid = <int*>calloc(nr_class, sizeof(int))
|
|
||||||
|
|
||||||
state.set_context_tokens(token_ids, nr_feat)
|
|
||||||
sum_state_features(scores,
|
|
||||||
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
|
|
||||||
self.moves.set_valid(is_valid, state)
|
|
||||||
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
|
|
||||||
action = self.moves.c[guess]
|
|
||||||
action.do(state, action.label)
|
|
||||||
|
|
||||||
free(is_valid)
|
|
||||||
free(scores)
|
|
||||||
free(token_ids)
|
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if BEAM_PARSE:
|
||||||
|
return self.update_beam(docs_tokvecs, golds, drop=drop, sgd=sgd,
|
||||||
|
losses=losses)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
docs, tokvec_lists = docs_tokvecs
|
docs, tokvec_lists = docs_tokvecs
|
||||||
|
@ -491,6 +487,10 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
|
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
||||||
|
tokvecs += my_tokvecs
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
|
|
||||||
|
@ -517,13 +517,13 @@ cdef class Parser:
|
||||||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||||
|
|
||||||
d_scores = self.get_batch_loss(states, golds, scores)
|
d_scores = self.get_batch_loss(states, golds, scores)
|
||||||
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
if drop != 0:
|
if drop != 0:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
|
|
||||||
if isinstance(self.model[0].ops, CupyOps) \
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to CPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
backprops.append((
|
backprops.append((
|
||||||
get_async(cuda_stream, token_ids),
|
get_async(cuda_stream, token_ids),
|
||||||
get_async(cuda_stream, d_vector),
|
get_async(cuda_stream, d_vector),
|
||||||
|
@ -540,7 +540,55 @@ cdef class Parser:
|
||||||
break
|
break
|
||||||
self._make_updates(d_tokvecs,
|
self._make_updates(d_tokvecs,
|
||||||
backprops, sgd, cuda_stream)
|
backprops, sgd, cuda_stream)
|
||||||
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
return d_tokvecs
|
||||||
|
|
||||||
|
def update_beam(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
|
docs, tokvecs = docs_tokvecs
|
||||||
|
lengths = [len(d) for d in docs]
|
||||||
|
assert min(lengths) >= 1
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
|
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
||||||
|
tokvecs += my_tokvecs
|
||||||
|
|
||||||
|
states = self.moves.init_batch(docs)
|
||||||
|
for gold in golds:
|
||||||
|
self.moves.preprocess_gold(gold)
|
||||||
|
|
||||||
|
cuda_stream = get_cuda_stream()
|
||||||
|
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
|
||||||
|
|
||||||
|
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||||
|
states, tokvecs, golds,
|
||||||
|
state2vec, vec2scores,
|
||||||
|
drop, sgd, losses,
|
||||||
|
width=8)
|
||||||
|
backprop_lower = []
|
||||||
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
ids, bp_vectors, bp_scores = backprops[i]
|
||||||
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
|
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||||
|
backprop_lower.append((
|
||||||
|
get_async(cuda_stream, ids),
|
||||||
|
get_async(cuda_stream, d_vector),
|
||||||
|
bp_vectors))
|
||||||
|
else:
|
||||||
|
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||||
|
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||||
|
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||||
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
return d_tokvecs
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
|
@ -585,14 +633,10 @@ cdef class Parser:
|
||||||
xp = get_array_module(d_tokvecs)
|
xp = get_array_module(d_tokvecs)
|
||||||
for ids, d_vector, bp_vector in backprops:
|
for ids, d_vector, bp_vector in backprops:
|
||||||
d_state_features = bp_vector(d_vector, sgd=sgd)
|
d_state_features = bp_vector(d_vector, sgd=sgd)
|
||||||
active_feats = ids * (ids >= 0)
|
mask = ids >= 0
|
||||||
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
|
indices = xp.nonzero(mask)
|
||||||
if hasattr(xp, 'scatter_add'):
|
self.model[0].ops.scatter_add(d_tokvecs, ids[indices],
|
||||||
xp.scatter_add(d_tokvecs,
|
d_state_features[indices])
|
||||||
ids, d_state_features * active_feats)
|
|
||||||
else:
|
|
||||||
xp.add.at(d_tokvecs,
|
|
||||||
ids, d_state_features * active_feats)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def move_names(self):
|
def move_names(self):
|
||||||
|
@ -603,7 +647,7 @@ cdef class Parser:
|
||||||
return names
|
return names
|
||||||
|
|
||||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||||
lower, upper = self.model
|
_, lower, upper = self.model
|
||||||
state2vec = precompute_hiddens(batch_size, tokvecs,
|
state2vec = precompute_hiddens(batch_size, tokvecs,
|
||||||
lower, stream, drop=dropout)
|
lower, stream, drop=dropout)
|
||||||
return state2vec, upper
|
return state2vec, upper
|
||||||
|
@@ -693,10 +737,12 @@ cdef class Parser:

     def to_disk(self, path, **exclude):
         serializers = {
-            'lower_model': lambda p: p.open('wb').write(
+            'tok2vec_model': lambda p: p.open('wb').write(
                 self.model[0].to_bytes()),
-            'upper_model': lambda p: p.open('wb').write(
+            'lower_model': lambda p: p.open('wb').write(
                 self.model[1].to_bytes()),
+            'upper_model': lambda p: p.open('wb').write(
+                self.model[2].to_bytes()),
             'vocab': lambda p: self.vocab.to_disk(p),
             'moves': lambda p: self.moves.to_disk(p, strings=False),
             'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
@@ -717,24 +763,29 @@ cdef class Parser:
             self.model, cfg = self.Model(**self.cfg)
         else:
             cfg = {}
-        with (path / 'lower_model').open('rb') as file_:
+        with (path / 'tok2vec_model').open('rb') as file_:
             bytes_data = file_.read()
         self.model[0].from_bytes(bytes_data)
-        with (path / 'upper_model').open('rb') as file_:
+        with (path / 'lower_model').open('rb') as file_:
             bytes_data = file_.read()
         self.model[1].from_bytes(bytes_data)
+        with (path / 'upper_model').open('rb') as file_:
+            bytes_data = file_.read()
+        self.model[2].from_bytes(bytes_data)
         self.cfg.update(cfg)
         return self

     def to_bytes(self, **exclude):
         serializers = OrderedDict((
-            ('lower_model', lambda: self.model[0].to_bytes()),
-            ('upper_model', lambda: self.model[1].to_bytes()),
+            ('tok2vec_model', lambda: self.model[0].to_bytes()),
+            ('lower_model', lambda: self.model[1].to_bytes()),
+            ('upper_model', lambda: self.model[2].to_bytes()),
             ('vocab', lambda: self.vocab.to_bytes()),
             ('moves', lambda: self.moves.to_bytes(strings=False)),
             ('cfg', lambda: ujson.dumps(self.cfg))
         ))
         if 'model' in exclude:
+            exclude['tok2vec_model'] = True
             exclude['lower_model'] = True
             exclude['upper_model'] = True
             exclude.pop('model')
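The serialization hunks above move from two model parts to three, written under tok2vec_model, lower_model and upper_model, and excluding 'model' now expands to all three keys. A standalone sketch of that dict-of-callbacks pattern (hypothetical byte strings and temp path, not the Parser API itself):

    import tempfile
    from pathlib import Path

    def to_disk_sketch(path, parts, **exclude):
        # Each key maps to a callback that writes one piece of the model;
        # 'model' is shorthand for all three sub-models.
        serializers = {
            'tok2vec_model': lambda p: p.write_bytes(parts[0]),
            'lower_model': lambda p: p.write_bytes(parts[1]),
            'upper_model': lambda p: p.write_bytes(parts[2]),
        }
        if 'model' in exclude:
            exclude.update(tok2vec_model=True, lower_model=True, upper_model=True)
            exclude.pop('model')
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        for name, write in serializers.items():
            if not exclude.get(name):
                write(path / name)

    tmp = Path(tempfile.mkdtemp())
    to_disk_sketch(tmp, [b'tok2vec', b'lower', b'upper'])          # writes all three files
    to_disk_sketch(tmp / 'empty', [b'', b'', b''], model=True)     # 'model' excludes everything
    assert sorted(p.name for p in tmp.iterdir() if p.is_file()) == \
        ['lower_model', 'tok2vec_model', 'upper_model']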
@@ -745,6 +796,7 @@ cdef class Parser:
             ('vocab', lambda b: self.vocab.from_bytes(b)),
             ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
             ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
+            ('tok2vec_model', lambda b: None),
             ('lower_model', lambda b: None),
             ('upper_model', lambda b: None)
         ))
@@ -754,10 +806,12 @@ cdef class Parser:
             self.model, cfg = self.Model(self.moves.n_moves)
         else:
             cfg = {}
+        if 'tok2vec_model' in msg:
+            self.model[0].from_bytes(msg['tok2vec_model'])
         if 'lower_model' in msg:
-            self.model[0].from_bytes(msg['lower_model'])
+            self.model[1].from_bytes(msg['lower_model'])
         if 'upper_model' in msg:
-            self.model[1].from_bytes(msg['upper_model'])
+            self.model[2].from_bytes(msg['upper_model'])
         self.cfg.update(cfg)
         return self

@@ -107,6 +107,8 @@ cdef class TransitionSystem:

     def is_valid(self, StateClass stcls, move_name):
         action = self.lookup_transition(move_name)
+        if action.move == 0:
+            return False
         return action.is_valid(stcls.c, action.label)

     cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
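The added move == 0 guard in is_valid treats the zero move id as "no such transition", so asking about a move name the transition system does not know returns False instead of delegating to a bogus action. A minimal Python sketch of the idea (hypothetical lookup table; the real code is Cython and still defers to action.is_valid for known moves):

    class Action(object):
        def __init__(self, move, label):
            self.move, self.label = move, label

    MISSING = Action(0, 0)                      # move 0 == "not a real transition"
    TABLE = {'L-nsubj': Action(2, 1), 'R-dobj': Action(3, 2)}

    def is_valid(state, move_name):
        action = TABLE.get(move_name, MISSING)
        if action.move == 0:                    # unknown name: never valid
            return False
        return True                             # real code: action.is_valid(state, action.label)

    assert is_valid(None, 'L-nsubj')
    assert not is_valid(None, 'B-PERSON')       # e.g. an NER move the parser does not define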
@@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc):
     parser(doc, beam_width=32, beam_density=0.001)
     for word in doc:
         print(word.text, word.head, word.dep_)
+
+
+def test_update_doc_beam(parser, tok2vec, model, doc, gold):
+    parser.model = model
+    tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
+    d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
+    assert d_tokvecs[0].shape == tokvecs[0].shape
+
+    def optimize(weights, gradient, key=None):
+        weights -= 0.001 * gradient
+
+    bp_tokvecs(d_tokvecs, sgd=optimize)
+    assert d_tokvecs[0].sum() == 0.
87
spacy/tests/parser/test_nn_beam.py
Normal file

@@ -0,0 +1,87 @@
+from __future__ import unicode_literals
+import pytest
+import numpy
+from thinc.api import layerize
+
+from ...vocab import Vocab
+from ...syntax.arc_eager import ArcEager
+from ...tokens import Doc
+from ...gold import GoldParse
+from ...syntax._beam_utils import ParserBeam, update_beam
+from ...syntax.stateclass import StateClass
+
+
+@pytest.fixture
+def vocab():
+    return Vocab()
+
+@pytest.fixture
+def moves(vocab):
+    aeager = ArcEager(vocab.strings, {})
+    aeager.add_action(2, 'nsubj')
+    aeager.add_action(3, 'dobj')
+    aeager.add_action(2, 'aux')
+    return aeager
+
+
+@pytest.fixture
+def docs(vocab):
+    return [Doc(vocab, words=['Rats', 'bite', 'things'])]
+
+@pytest.fixture
+def states(docs):
+    return [StateClass(doc) for doc in docs]
+
+@pytest.fixture
+def tokvecs(docs, vector_size):
+    output = []
+    for doc in docs:
+        vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
+        output.append(numpy.asarray(vec))
+    return output
+
+
+@pytest.fixture
+def golds(docs):
+    return [GoldParse(doc) for doc in docs]
+
+
+@pytest.fixture
+def batch_size(docs):
+    return len(docs)
+
+
+@pytest.fixture
+def beam_width():
+    return 4
+
+
+@pytest.fixture
+def vector_size():
+    return 6
+
+
+@pytest.fixture
+def beam(moves, states, golds, beam_width):
+    return ParserBeam(moves, states, golds, width=beam_width)
+
+@pytest.fixture
+def scores(moves, batch_size, beam_width):
+    return [
+        numpy.asarray(
+            numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
+            dtype='f')
+        for _ in range(batch_size)]
+
+
+def test_create_beam(beam):
+    pass
+
+
+def test_beam_advance(beam, scores):
+    beam.advance(scores)
+
+
+def test_beam_advance_too_few_scores(beam, scores):
+    with pytest.raises(IndexError):
+        beam.advance(scores[:-1])