Merge remote-tracking branch 'origin/develop' into feature/phrasematcher

Matthew Honnibal 2017-09-26 08:32:55 -05:00
commit d02a41a8c9
9 changed files with 219 additions and 105 deletions

requirements.txt

@@ -1,4 +1,4 @@
-cython>=0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32

spacy/_ml.py

@@ -4,6 +4,7 @@ from thinc.neural import Model, Maxout, Softmax, Affine
 from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
+import thinc.extra.load_nlp
 import random
 import cytoolz
@@ -31,6 +32,7 @@ from . import util
 import numpy
 import io
 
+VECTORS_KEY = 'spacy_pretrained_vectors'
 
 @layerize
 def _flatten_add_lengths(seqs, pad=0, drop=0.):
@@ -225,42 +227,52 @@ def drop_layer(layer, factor=2.):
         model.predict = layer
     return model
 
+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+
-def Tok2Vec(width, embed_size, pretrained_dims=0):
-    if pretrained_dims is None:
-        pretrained_dims = 0
+def Tok2Vec(width, embed_size, **kwargs):
+    pretrained_dims = kwargs.get('pretrained_dims', 0)
+    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
-        trained_vectors = (
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+        convolution = Residual(
+            ExtractWindow(nW=1)
+            >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
+        )
+        tok2vec = (
             FeatureExtracter(cols)
             >> with_flatten(
-                uniqued(
-                    (norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
-            )
+                embed >> (convolution * 4), pad=4)
         )
-        convolution = Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3, pieces=3)))
-        if pretrained_dims >= 1:
-            embed = concatenate_lists(trained_vectors, SpacyVectors)
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    Affine(width, width+pretrained_dims)
-                    >> convolution ** 4,
-                    pad=4)
-            )
-        else:
-            embed = trained_vectors
-            tok2vec = (
-                embed
-                >> with_flatten(convolution ** 4, pad=4)
-            )
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
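
The new link_vectors_to_models helper assigns every lexeme a .rank equal to its row in the vocab's vectors table, then publishes that table in thinc's global registry so StaticVectors can find it by (device, VECTORS_KEY). A minimal pure-Python sketch of the ranking step (the dict below stands in for vectors.key2row; it is not spaCy's API):

    # Stand-in for vectors.key2row: orth ID -> row in the vectors table.
    key2row = {'apple': 0, 'banana': 1}
    # Missing words fall back to rank 0, exactly as in the loop above.
    ranks = {word: key2row.get(word, 0) for word in ['apple', 'banana', 'cherry']}
    print(ranks)  # {'apple': 0, 'banana': 1, 'cherry': 0}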
@@ -268,6 +280,28 @@ def Tok2Vec(width, embed_size, pretrained_dims=0):
     return tok2vec
 
+def reapply(layer, n_times):
+    def reapply_fwd(X, drop=0.):
+        backprops = []
+        for i in range(n_times):
+            Y, backprop = layer.begin_update(X, drop=drop)
+            X = Y
+            backprops.append(backprop)
+
+        def reapply_bwd(dY, sgd=None):
+            dX = None
+            for backprop in reversed(backprops):
+                dY = backprop(dY, sgd=sgd)
+                if dX is None:
+                    dX = dY
+                else:
+                    dX += dY
+            return dX
+        return Y, reapply_bwd
+    return wrap(reapply_fwd, layer)
+
 def asarray(ops, dtype):
     def forward(X, drop=0.):
         return ops.asarray(X, dtype=dtype), None
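
reapply is what the new '*' operator in Tok2Vec's operator table binds to, so convolution * 4 runs one weight-tied layer four times. A pure-Python analogue of the control flow (plain closures stand in for thinc's begin_update and wrap; this is a sketch, not the library API):

    def reapply_plain(step, n_times, x):
        # Forward: apply the same step repeatedly, saving each backprop.
        backprops = []
        for _ in range(n_times):
            x, backprop = step(x)
            backprops.append(backprop)

        def backward(d_out):
            # Backward: run the callbacks in reverse, summing the gradient
            # seen after each one, mirroring the dX accumulation above.
            d_in = 0.0
            for backprop in reversed(backprops):
                d_out = backprop(d_out)
                d_in += d_out
            return d_in

        return x, backward

    double = lambda v: (2.0 * v, lambda g: 2.0 * g)  # y = 2x, grad g -> 2g
    y, backward = reapply_plain(double, 3, 1.0)
    print(y, backward(1.0))  # 8.0 14.0 (the per-step gradients 2 + 4 + 8)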
@@ -471,8 +505,13 @@ def getitem(i):
         return X[i], None
     return layerize(getitem_fwd)
 
-def build_tagger_model(nr_class, token_vector_width, pretrained_dims=0, **cfg):
+def build_tagger_model(nr_class, **cfg):
     embed_size = util.env_opt('embed_size', 4000)
+    if 'token_vector_width' in cfg:
+        token_vector_width = cfg['token_vector_width']
+    else:
+        token_vector_width = util.env_opt('token_vector_width', 128)
+    pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add}):
         # Input: (doc, tensor) tuples
         private_tok2vec = Tok2Vec(token_vector_width, embed_size,
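
build_tagger_model now takes its dimensions from **cfg rather than positional arguments, with util.env_opt supplying defaults. A sketch of that precedence with a simplified stand-in for env_opt (spaCy's real helper reads SPACY_-prefixed environment variables; treat the prefix as an assumption here):

    import os

    def env_opt(name, default):
        # Simplified stand-in: an environment variable overrides the default.
        return type(default)(os.environ.get('SPACY_' + name.upper(), default))

    cfg = {'token_vector_width': 96}
    if 'token_vector_width' in cfg:           # explicit config wins...
        token_vector_width = cfg['token_vector_width']
    else:                                     # ...else env var or default
        token_vector_width = env_opt('token_vector_width', 128)
    print(token_vector_width)  # 96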

spacy/cli/train.py

@@ -8,6 +8,7 @@ import cytoolz
 from pathlib import Path
 import dill
 import tqdm
+from thinc.neural._classes.model import Model
 from thinc.neural.optimizers import linear_decay
 from timeit import default_timer as timer
@@ -17,6 +18,7 @@ from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
+from .. import about
 from .. import displacy
 from ..compat import json_dumps
@@ -29,15 +31,16 @@ from ..compat import json_dumps
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
-    resume=("Whether to resume training", "flag", "R", bool),
+    vectors=("Model to load vectors from", "option", "v"),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
-          gold_preproc=False):
+          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
+          gold_preproc=False, meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -46,13 +49,19 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
+    meta_path = util.ensure_path(meta_path)
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
         prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
         prints(dev_path, title="Development data not found", exits=1)
+    if meta_path is not None and not meta_path.exists():
+        prints(meta_path, title="meta.json not found", exits=1)
+    meta = util.read_json(meta_path) if meta_path else {}
+    if not isinstance(meta, dict):
+        prints("Expected dict but got: {}".format(type(meta)),
+               title="Not a valid meta.json format", exits=1)
 
     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
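
The new --meta-path option threads a user-supplied meta.json through training. A rough standard-library equivalent of the loading and validation above (util.read_json and prints are spaCy helpers; json and pathlib stand in for them here):

    import json
    from pathlib import Path

    meta_path = Path('meta.json')
    meta = json.loads(meta_path.read_text()) if meta_path.exists() else {}
    if not isinstance(meta, dict):
        raise SystemExit("Expected dict but got: {}".format(type(meta)))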
@@ -69,26 +78,23 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
-    if not resume:
-        lang_class = util.get_lang_class(lang)
-        nlp = lang_class(pipeline=pipeline)
-    else:
-        print("Load resume")
-        nlp = _resume_model(lang, pipeline)
-        lang_class = nlp.__class__
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_words = corpus.count_train()
+    lang_class = util.get_lang_class(lang)
+    nlp = lang_class(pipeline=pipeline)
+    if vectors:
+        util.load_model(vectors, vocab=nlp.vocab)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None
 
     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
-        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
-                                       gold_preproc=gold_preproc, max_length=0)
-        train_docs = list(train_docs)
         for i in range(n_iter):
             with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
+                train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
+                                               gold_preproc=gold_preproc, max_length=0)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)
@@ -103,32 +109,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 nlp.to_disk(epoch_model_path)
                 nlp_loaded = lang_class(pipeline=pipeline)
                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
-                scorer = nlp_loaded.evaluate(
+                scorer = nlp.evaluate(
                     corpus.dev_docs(
-                        nlp_loaded,
+                        nlp,
                         gold_preproc=gold_preproc))
-                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
-                with acc_loc.open('w') as file_:
-                    file_.write(json_dumps(scorer.scores))
+                meta_loc = output_path / ('model%d' % i) / 'meta.json'
+                meta['accuracy'] = scorer.scores
+                meta['lang'] = nlp.lang
+                meta['pipeline'] = pipeline
+                meta['spacy_version'] = '>=%s' % about.__version__
+                meta.setdefault('name', 'model%d' % i)
+                meta.setdefault('version', '0.0.0')
+                with meta_loc.open('w') as file_:
+                    file_.write(json_dumps(meta))
                 util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally:
         print("Saving model...")
-        with (output_path / 'model-final.pickle').open('wb') as file_:
-            with nlp.use_params(optimizer.averages):
-                dill.dump(nlp, file_, -1)
-
-
-def _resume_model(lang, pipeline):
-    nlp = util.load_model(lang)
-    pipes = {getattr(pipe, 'name', None) for pipe in nlp.pipeline}
-    for name in pipeline:
-        if name not in pipes:
-            factory = nlp.Defaults.factories[name]
-            nlp.pipeline.extend(factory(nlp))
-    nlp.meta['pipeline'] = pipeline
-    return nlp
+        try:
+            with (output_path / 'model-final.pickle').open('wb') as file_:
+                with nlp.use_params(optimizer.averages):
+                    dill.dump(nlp, file_, -1)
+        except:
+            pass
 
 def _render_parses(i, to_render):
     to_render[0].user_data['title'] = "Batch %d" % i
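
Each epoch now writes a complete meta.json beside the saved model, folding what used to live in accuracy.json into an 'accuracy' key. A sketch of the resulting document with placeholder values (the real scores come from the Scorer):

    import json

    meta = {'lang': 'en',
            'pipeline': ['tags', 'dependencies', 'entities'],
            'spacy_version': '>=2.0.0',
            'accuracy': {'uas': 0.0, 'tags_acc': 0.0}}
    meta.setdefault('name', 'model0')     # values from --meta-path win
    meta.setdefault('version', '0.0.0')
    print(json.dumps(meta, indent=2, sort_keys=True))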

spacy/language.py

@@ -342,7 +342,27 @@ class Language(object):
         for doc, gold in docs_golds:
             yield doc, gold
 
-    def begin_training(self, get_gold_tuples, **cfg):
+    def resume_training(self, **cfg):
+        if cfg.get('device', -1) >= 0:
+            device = util.use_gpu(cfg['device'])
+            if self.vocab.vectors.data.shape[1] >= 1:
+                self.vocab.vectors.data = Model.ops.asarray(
+                    self.vocab.vectors.data)
+        else:
+            device = None
+        learn_rate = util.env_opt('learn_rate', 0.001)
+        beta1 = util.env_opt('optimizer_B1', 0.9)
+        beta2 = util.env_opt('optimizer_B2', 0.999)
+        eps = util.env_opt('optimizer_eps', 1e-08)
+        L2 = util.env_opt('L2_penalty', 1e-6)
+        max_grad_norm = util.env_opt('grad_norm_clip', 1.)
+        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
+                               beta2=beta2, eps=eps)
+        self._optimizer.max_grad_norm = max_grad_norm
+        self._optimizer.device = device
+        return self._optimizer
+
+    def begin_training(self, get_gold_tuples=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.
@@ -353,17 +373,14 @@ class Language(object):
         if self.parser:
             self.pipeline.append(NeuralLabeller(self.vocab))
         # Populate vocab
-        for _, annots_brackets in get_gold_tuples():
-            for annots, _ in annots_brackets:
-                for word in annots[1]:
-                    _ = self.vocab[word]
+        if get_gold_tuples is not None:
+            for _, annots_brackets in get_gold_tuples():
+                for annots, _ in annots_brackets:
+                    for word in annots[1]:
+                        _ = self.vocab[word]
         contexts = []
         if cfg.get('device', -1) >= 0:
-            import cupy.cuda.device
-            device = cupy.cuda.device.Device(cfg['device'])
-            device.use()
-            Model.ops = CupyOps()
-            Model.Ops = CupyOps
+            device = util.use_gpu(cfg['device'])
             if self.vocab.vectors.data.shape[1] >= 1:
                 self.vocab.vectors.data = Model.ops.asarray(
                     self.vocab.vectors.data)
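
A usage sketch for the new resume_training hook (hypothetical calling code; assumes a trained pipeline is already installed and loadable): unlike begin_training, it only rebuilds the Adam optimizer, so an existing model's weights are kept.

    import spacy

    nlp = spacy.load('en')                      # any installed, trained model
    optimizer = nlp.resume_training(device=-1)  # device=0 would route through util.use_gpu
    # ...then keep calling nlp.update(docs, golds, sgd=optimizer) as usual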

spacy/pipeline.pyx

@@ -43,11 +43,12 @@ from .compat import json_dumps
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten
 from ._ml import build_text_classifier, build_tagger_model
+from ._ml import link_vectors_to_models
 from .parts_of_speech import X
 
 class SentenceSegmenter(object):
-    '''A simple spaCy hook, to allow custom sentence boundary detection logic
+    """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse).
 
     To change the sentence boundary detection strategy, pass a generator
@@ -56,7 +57,7 @@ class SentenceSegmenter(object):
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
-    '''
+    """
     name = 'sbd'
 
     def __init__(self, vocab, strategy=None):
@@ -88,17 +89,30 @@ class BaseThincComponent(object):
     @classmethod
     def Model(cls, *shape, **kwargs):
+        """Initialize a model for the pipe."""
         raise NotImplementedError
 
     def __init__(self, vocab, model=True, **cfg):
+        """Create a new pipe instance."""
         raise NotImplementedError
 
     def __call__(self, doc):
+        """Apply the pipe to one document. The document is
+        modified in-place, and returned.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         scores = self.predict([doc])
         self.set_annotations([doc], scores)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Apply the pipe to a stream of documents.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             scores = self.predict(docs)
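
The docstrings added here pin down the contract: __call__ and pipe stay thin wrappers over predict() and set_annotations(). A standalone toy component honoring that contract (no thinc involved; the Doc is faked with a namespace object):

    import types

    class ToyComponent(object):
        name = 'toy'

        def predict(self, docs):
            # Compute scores without modifying the docs.
            return [len(doc.words) for doc in docs]

        def set_annotations(self, docs, scores):
            # Write the pre-computed scores onto the docs in-place.
            for doc, score in zip(docs, scores):
                doc.score = score

        def __call__(self, doc):
            scores = self.predict([doc])
            self.set_annotations([doc], scores)
            return doc

    doc = ToyComponent()(types.SimpleNamespace(words=['hello', 'world']))
    print(doc.score)  # 2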
@@ -106,27 +120,43 @@ class BaseThincComponent(object):
             yield from docs
 
     def predict(self, docs):
+        """Apply the pipeline's model to a batch of docs, without
+        modifying them.
+        """
         raise NotImplementedError
 
     def set_annotations(self, docs, scores):
+        """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError
 
-    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model.
+
+        Delegates to predict() and get_loss().
+        """
         raise NotImplementedError
 
     def get_loss(self, docs, golds, scores):
+        """Find the loss and gradient of loss for the batch of
+        documents and their predicted scores."""
         raise NotImplementedError
 
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
-        token_vector_width = pipeline[0].model.nO
+        """Initialize the pipe for training, using data examples if available.
+        If no model has been initialized yet, the model is added."""
         if self.model is True:
-            self.model = self.Model(1, token_vector_width)
+            self.model = self.Model(**self.cfg)
+        link_vectors_to_models(self.vocab)
 
     def use_params(self, params):
+        """Modify the pipe's model, to use the given parameter values.
+        """
         with self.model.use_params(params):
             yield
 
     def to_bytes(self, **exclude):
+        """Serialize the pipe to a bytestring."""
         serialize = OrderedDict((
             ('cfg', lambda: json_dumps(self.cfg)),
             ('model', lambda: self.model.to_bytes()),
@@ -135,6 +165,7 @@ class BaseThincComponent(object):
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
+        """Load the pipe from a bytestring."""
         def load_model(b):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -143,21 +174,23 @@ class BaseThincComponent(object):
         deserialize = OrderedDict((
             ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
-            ('model', load_model),
             ('vocab', lambda b: self.vocab.from_bytes(b))
+            ('model', load_model),
         ))
         util.from_bytes(bytes_data, deserialize, exclude)
         return self
 
     def to_disk(self, path, **exclude):
+        """Serialize the pipe to disk."""
         serialize = OrderedDict((
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
+            ('vocab', lambda p: self.vocab.to_disk(p)),
             ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
-            ('vocab', lambda p: self.vocab.to_disk(p))
         ))
         util.to_disk(path, serialize, exclude)
 
     def from_disk(self, path, **exclude):
+        """Load the pipe from disk."""
         def load_model(p):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -166,8 +199,8 @@ class BaseThincComponent(object):
         deserialize = OrderedDict((
             ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
-            ('model', load_model),
             ('vocab', lambda p: self.vocab.from_disk(p)),
+            ('model', load_model),
         ))
         util.from_disk(path, deserialize, exclude)
         return self
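
Note the reordering in these OrderedDicts: 'vocab' now precedes 'model' on the read path. load_model consults self.vocab (for pretrained_dims), so the vocab entry has to be restored first. A minimal illustration of order-dependent handlers:

    from collections import OrderedDict

    state = {}
    handlers = OrderedDict((
        ('vocab', lambda: state.update(vectors_length=300)),
        # Running this before 'vocab' would raise a KeyError.
        ('model', lambda: print('pretrained_dims =', state['vectors_length'])),
    ))
    for name, handler in handlers.items():
        handler()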
@@ -215,6 +248,7 @@ class TokenVectorEncoder(BaseThincComponent):
         self.model = model
         self.cfg = dict(cfg)
         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+        self.cfg.setdefault('cnn_maxout_pieces', 3)
 
     def __call__(self, doc):
         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@@ -286,9 +320,9 @@ class TokenVectorEncoder(BaseThincComponent):
         pipeline (list): The pipeline the model is part of.
         """
         if self.model is True:
-            self.model = self.Model(
-                pretrained_dims=self.vocab.vectors_length,
-                **self.cfg)
+            self.cfg['pretrained_dims'] = self.vocab.vectors_length
+            self.model = self.Model(**self.cfg)
+        link_vectors_to_models(self.vocab)
 
 class NeuralTagger(BaseThincComponent):
@@ -297,6 +331,8 @@ class NeuralTagger(BaseThincComponent):
         self.vocab = vocab
         self.model = model
         self.cfg = dict(cfg)
+        self.cfg.setdefault('cnn_maxout_pieces', 2)
+        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
 
     def __call__(self, doc):
         tags = self.predict(([doc], [doc.tensor]))
@@ -393,15 +429,14 @@ class NeuralTagger(BaseThincComponent):
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                           vocab.morphology.lemmatizer,
                                           exc=vocab.morphology.exc)
-        token_vector_width = pipeline[0].model.nO
         if self.model is True:
-            self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
-                                    pretrained_dims=self.vocab.vectors_length)
+            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+        link_vectors_to_models(self.vocab)
 
     @classmethod
-    def Model(cls, n_tags, token_vector_width, pretrained_dims=0):
-        return build_tagger_model(n_tags, token_vector_width,
-                                  pretrained_dims)
+    def Model(cls, n_tags, **cfg):
+        return build_tagger_model(n_tags, **cfg)
 
     def use_params(self, params):
         with self.model.use_params(params):
@@ -422,8 +457,7 @@ class NeuralTagger(BaseThincComponent):
             if self.model is True:
                 token_vector_width = util.env_opt('token_vector_width',
                                                   self.cfg.get('token_vector_width', 128))
-                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
-                                        pretrained_dims=self.vocab.vectors_length)
+                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
                 self.model.from_bytes(b)
 
         def load_tag_map(b):
@@ -442,6 +476,7 @@ class NeuralTagger(BaseThincComponent):
         return self
 
     def to_disk(self, path, **exclude):
+        self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
         serialize = OrderedDict((
             ('vocab', lambda p: self.vocab.to_disk(p)),
             ('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
@@ -456,10 +491,7 @@ class NeuralTagger(BaseThincComponent):
     def from_disk(self, path, **exclude):
         def load_model(p):
             if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width',
-                                                  self.cfg.get('token_vector_width', 128))
-                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
-                                        **self.cfg)
+                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
                 self.model.from_bytes(p.open('rb').read())
 
         def load_tag_map(p):
@@ -486,6 +518,8 @@ class NeuralLabeller(NeuralTagger):
         self.vocab = vocab
         self.model = model
         self.cfg = dict(cfg)
+        self.cfg.setdefault('cnn_maxout_pieces', 2)
+        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
 
     @property
     def labels(self):
@@ -508,13 +542,13 @@ class NeuralLabeller(NeuralTagger):
                     self.labels[dep] = len(self.labels)
         token_vector_width = pipeline[0].model.nO
         if self.model is True:
-            self.model = self.Model(len(self.labels), token_vector_width,
-                                    pretrained_dims=self.vocab.vectors_length)
+            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+            self.model = self.Model(len(self.labels), **self.cfg)
+        link_vectors_to_models(self.vocab)
 
     @classmethod
-    def Model(cls, n_tags, token_vector_width, pretrained_dims=0):
-        return build_tagger_model(n_tags, token_vector_width,
-                                  pretrained_dims)
+    def Model(cls, n_tags, **cfg):
+        return build_tagger_model(n_tags, **cfg)
 
     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
@@ -562,7 +596,7 @@ class SimilarityHook(BaseThincComponent):
         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
 
     def __call__(self, doc):
-        '''Install similarity hook'''
+        """Install similarity hook"""
        doc.user_hooks['similarity'] = self.predict
         return doc
@@ -590,6 +624,7 @@ class SimilarityHook(BaseThincComponent):
         """
         if self.model is True:
             self.model = self.Model(pipeline[0].model.nO)
+        link_vectors_to_models(self.vocab)
 
 class TextCategorizer(BaseThincComponent):
@@ -663,6 +698,7 @@ class TextCategorizer(BaseThincComponent):
             self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(len(self.labels), token_vector_width,
                                     **self.cfg)
+        link_vectors_to_models(self.vocab)
 
 cdef class EntityRecognizer(LinearParser):

spacy/syntax/nn_parser.pyx

@@ -49,6 +49,7 @@ from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
 from .._ml import Residual, drop_layer
+from .._ml import link_vectors_to_models
 from ..compat import json_dumps
 
 from . import _parse_features
@@ -309,6 +310,7 @@ cdef class Parser:
             cfg['beam_density'] = util.env_opt('beam_density', 0.0)
         if 'pretrained_dims' not in cfg:
             cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+        cfg.setdefault('cnn_maxout_pieces', 3)
         self.cfg = cfg
         if 'actions' in self.cfg:
             for action, labels in self.cfg.get('actions', {}).items():
@@ -790,6 +792,7 @@ cdef class Parser:
         if self.model is True:
             cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
+            link_vectors_to_models(self.vocab)
         self.cfg.update(cfg)
 
     def preprocess_gold(self, docs_golds):
@@ -871,8 +874,7 @@ cdef class Parser:
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
             if self.model is True:
-                self.model, cfg = self.Model(self.moves.n_moves,
-                                             pretrained_dims=self.vocab.vectors_length)
+                self.model, cfg = self.Model(**self.cfg)
                 cfg['pretrained_dims'] = self.vocab.vectors_length
             else:
                 cfg = {}

spacy/util.py

@@ -14,6 +14,7 @@ import numpy
 import io
 import dill
 from collections import OrderedDict
+from thinc.neural._classes.model import Model
 import msgpack
 import msgpack_numpy
@@ -557,3 +558,14 @@ def minify_html(html):
     RETURNS (unicode): "Minified" HTML.
     """
     return html.strip().replace(' ', '').replace('\n', '')
+
+
+def use_gpu(gpu_id):
+    import cupy.cuda.device
+    from thinc.neural.ops import CupyOps
+    device = cupy.cuda.device.Device(gpu_id)
+    device.use()
+    Model.ops = CupyOps()
+    Model.Ops = CupyOps
+    return device
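
A hedged usage sketch for the new util.use_gpu helper: it requires cupy and a CUDA device, and on a CPU-only machine the import inside the helper raises ImportError, so callers can fall back to NumPy.

    try:
        device = use_gpu(0)   # swaps thinc's global ops over to CupyOps
    except ImportError:
        device = None         # no GPU stack available; stay on NumpyOps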

spacy/vocab.pyx

@@ -27,6 +27,7 @@ from .vectors import Vectors
 from . import util
 from . import attrs
 from . import symbols
+from ._ml import link_vectors_to_models
 
 cdef class Vocab:
@@ -323,6 +324,7 @@ cdef class Vocab:
             self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
+        link_vectors_to_models(self)
         return self
 
     def to_bytes(self, **exclude):
@@ -362,6 +364,7 @@ cdef class Vocab:
             ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
+        link_vectors_to_models(self)
         return self
 
     def lexemes_to_bytes(self):
@@ -436,6 +439,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
     vocab.lex_attr_getters = lex_attr_getters
     vocab.lexemes_from_bytes(lexemes_data)
     vocab.length = length
+    link_vectors_to_models(vocab)
     return vocab

travis.sh

@@ -17,7 +17,7 @@ fi
 if [ "${VIA}" == "compile" ]; then
     pip install -r requirements.txt
-    python setup.py build_ext --inplace
+    python setup.py clean --all
     pip install -e .
 fi