Merge remote-tracking branch 'origin/develop' into feature/phrasematcher
This commit is contained in: commit d02a41a8c9
requirements.txt

@@ -1,4 +1,4 @@
cython>=0.24
cython>=0.24,<0.27.0
pathlib
numpy>=1.7
cymem>=1.30,<1.32
spacy/_ml.py (93 changed lines)
@@ -4,6 +4,7 @@ from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import thinc.extra.load_nlp
import random
import cytoolz

@@ -31,6 +32,7 @@ from . import util
import numpy
import io

VECTORS_KEY = 'spacy_pretrained_vectors'

@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.):

@@ -225,42 +227,52 @@ def drop_layer(layer, factor=2.):
model.predict = layer
return model

def link_vectors_to_models(vocab):
vectors = vocab.vectors
ops = Model.ops
for word in vocab:
if word.orth in vectors.key2row:
word.rank = vectors.key2row[word.orth]
else:
word.rank = 0
data = ops.asarray(vectors.data)
# Set an entry here, so that vectors are accessed by StaticVectors
# (unideal, I know)
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data

def Tok2Vec(width, embed_size, pretrained_dims=0):
if pretrained_dims is None:
pretrained_dims = 0

def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
'*': reapply}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
if pretrained_dims is not None and pretrained_dims >= 1:
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))

trained_vectors = (
embed = uniqued(
(glove | norm | prefix | suffix | shape)
>> LN(Maxout(width, width*5, pieces=3)), column=5)
else:
embed = uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5)

convolution = Residual(
ExtractWindow(nW=1)
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
)

tok2vec = (
FeatureExtracter(cols)
>> with_flatten(
uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5)
)
embed >> (convolution * 4), pad=4)
)
convolution = Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3, pieces=3)))

if pretrained_dims >= 1:
embed = concatenate_lists(trained_vectors, SpacyVectors)
tok2vec = (
embed
>> with_flatten(
Affine(width, width+pretrained_dims)
>> convolution ** 4,
pad=4)
)
else:
embed = trained_vectors
tok2vec = (
embed
>> with_flatten(convolution ** 4, pad=4)
)

# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width

@@ -268,6 +280,28 @@ def Tok2Vec(width, embed_size, pretrained_dims=0):
return tok2vec

def reapply(layer, n_times):
def reapply_fwd(X, drop=0.):
backprops = []
for i in range(n_times):
Y, backprop = layer.begin_update(X, drop=drop)
X = Y
backprops.append(backprop)
def reapply_bwd(dY, sgd=None):
dX = None
for backprop in reversed(backprops):
dY = backprop(dY, sgd=sgd)
if dX is None:
dX = dY
else:
dX += dY
return dX
return Y, reapply_bwd
return wrap(reapply_fwd, layer)

def asarray(ops, dtype):
def forward(X, drop=0.):
return ops.asarray(X, dtype=dtype), None

@@ -471,8 +505,13 @@ def getitem(i):
return X[i], None
return layerize(getitem_fwd)

def build_tagger_model(nr_class, token_vector_width, pretrained_dims=0, **cfg):
def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt('embed_size', 4000)
if 'token_vector_width' in cfg:
token_vector_width = cfg['token_vector_width']
else:
token_vector_width = util.env_opt('token_vector_width', 128)
pretrained_dims = cfg.get('pretrained_dims', 0)
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, embed_size,
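Not part of the commit, for orientation only: a minimal sketch of how the reworked keyword-based signatures above might be called, assuming this branch of spaCy is importable. The width, embed_size and cfg values are illustrative, not defaults taken from the diff.

    from spacy._ml import Tok2Vec, build_tagger_model

    # Hyper-parameters now travel in a cfg dict and are read via kwargs.get(...)
    cfg = {'pretrained_dims': 0, 'cnn_maxout_pieces': 3}
    tok2vec = Tok2Vec(width=128, embed_size=4000, **cfg)

    # build_tagger_model() now takes token_vector_width from cfg (or an
    # environment option) instead of a positional argument.
    tagger = build_tagger_model(50, token_vector_width=128, **cfg)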
spacy/cli/train.py
@@ -8,6 +8,7 @@ import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer

@@ -17,6 +18,7 @@ from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps

@@ -29,15 +31,16 @@ from ..compat import json_dumps
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
resume=("Whether to resume training", "flag", "R", bool),
vectors=("Model to load vectors from", "option", "v"),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False):
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False, meta_path=None):
"""
Train a model. Expects data in spaCy's JSON format.
"""

@@ -46,13 +49,19 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data)
meta_path = util.ensure_path(meta_path)
if not output_path.exists():
output_path.mkdir()
if not train_path.exists():
prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=1)

if meta_path is not None and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=1)
meta = util.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict):
prints("Expected dict but got: {}".format(type(meta)),
title="Not a valid meta.json format", exits=1)

pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')

@@ -69,26 +78,23 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 64),
util.env_opt('batch_compound', 1.001))

if not resume:
lang_class = util.get_lang_class(lang)
nlp = lang_class(pipeline=pipeline)
else:
print("Load resume")
nlp = _resume_model(lang, pipeline)
lang_class = nlp.__class__

corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()

lang_class = util.get_lang_class(lang)
nlp = lang_class(pipeline=pipeline)
if vectors:
util.load_model(vectors, vocab=nlp.vocab)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None

print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
train_docs = list(train_docs)
for i in range(n_iter):
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)

@@ -103,32 +109,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
nlp.to_disk(epoch_model_path)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
scorer = nlp.evaluate(
corpus.dev_docs(
nlp_loaded,
nlp,
gold_preproc=gold_preproc))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
meta.setdefault('name', 'model%d' % i)
meta.setdefault('version', '0.0.0')

with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
finally:
print("Saving model...")
with (output_path / 'model-final.pickle').open('wb') as file_:
with nlp.use_params(optimizer.averages):
dill.dump(nlp, file_, -1)

def _resume_model(lang, pipeline):
nlp = util.load_model(lang)
pipes = {getattr(pipe, 'name', None) for pipe in nlp.pipeline}
for name in pipeline:
if name not in pipes:
factory = nlp.Defaults.factories[name]
nlp.pipeline.extend(factory(nlp))
nlp.meta['pipeline'] = pipeline
return nlp

try:
with (output_path / 'model-final.pickle').open('wb') as file_:
with nlp.use_params(optimizer.averages):
dill.dump(nlp, file_, -1)
except:
pass

def _render_parses(i, to_render):
to_render[0].user_data['title'] = "Batch %d" % i
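Not part of the commit: a sketch of what the new vectors option (-v) does before training starts, namely loading another model's vectors into the pipeline's vocab via util.load_model(..., vocab=...). Assumes a spaCy install from this branch; 'en_vectors' is a placeholder package name.

    from spacy import util

    lang_class = util.get_lang_class('en')
    nlp = lang_class(pipeline=['tags', 'dependencies', 'entities'])
    # Pull vectors from another installed model into this pipeline's vocab,
    # as train() now does when the vectors option is given.
    util.load_model('en_vectors', vocab=nlp.vocab)
    optimizer = nlp.begin_training(None, device=-1)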
spacy/language.py
@@ -342,7 +342,27 @@ class Language(object):
for doc, gold in docs_golds:
yield doc, gold

def begin_training(self, get_gold_tuples, **cfg):
def resume_training(self, **cfg):
if cfg.get('device', -1) >= 0:
device = util.use_gpu(cfg['device'])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(
self.vocab.vectors.data)
else:
device = None
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer

def begin_training(self, get_gold_tuples=None, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.

@@ -353,17 +373,14 @@ class Language(object):
if self.parser:
self.pipeline.append(NeuralLabeller(self.vocab))
# Populate vocab
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
if get_gold_tuples is not None:
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
contexts = []
if cfg.get('device', -1) >= 0:
import cupy.cuda.device
device = cupy.cuda.device.Device(cfg['device'])
device.use()
Model.ops = CupyOps()
Model.Ops = CupyOps
device = util.use_gpu(cfg['device'])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(
self.vocab.vectors.data)
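Not part of the commit: a usage sketch for the new Language.resume_training() entry point and the now-optional gold tuples in begin_training(), assuming a blank English pipeline from this branch.

    from spacy.lang.en import English

    nlp = English()
    # Keep the existing weights and just get a fresh Adam optimizer back:
    optimizer = nlp.resume_training(device=-1)
    # Gold tuples may now be omitted when starting training from scratch:
    # optimizer = nlp.begin_training(None, device=-1)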
spacy/pipeline.pyx
@@ -43,11 +43,12 @@ from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten
from ._ml import build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from .parts_of_speech import X

class SentenceSegmenter(object):
'''A simple spaCy hook, to allow custom sentence boundary detection logic
"""A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse).

To change the sentence boundary detection strategy, pass a generator

@@ -56,7 +57,7 @@ class SentenceSegmenter(object):

Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence.
'''
"""
name = 'sbd'

def __init__(self, vocab, strategy=None):

@@ -88,17 +89,30 @@ class BaseThincComponent(object):

@classmethod
def Model(cls, *shape, **kwargs):
"""Initialize a model for the pipe."""
raise NotImplementedError

def __init__(self, vocab, model=True, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError

def __call__(self, doc):
"""Apply the pipe to one document. The document is
modified in-place, and returned.

Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc

def pipe(self, stream, batch_size=128, n_threads=-1):
"""Apply the pipe to a stream of documents.

Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)

@@ -106,27 +120,43 @@ class BaseThincComponent(object):
yield from docs

def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without
modifying them.
"""
raise NotImplementedError

def set_annotations(self, docs, scores):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError

def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.

Delegates to predict() and get_loss().
"""
raise NotImplementedError

def get_loss(self, docs, golds, scores):
"""Find the loss and gradient of loss for the batch of
documents and their predicted scores."""
raise NotImplementedError

def begin_training(self, gold_tuples=tuple(), pipeline=None):
token_vector_width = pipeline[0].model.nO
"""Initialize the pipe for training, using data exampes if available.
If no model has been initialized yet, the model is added."""
if self.model is True:
self.model = self.Model(1, token_vector_width)
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)

def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values.
"""
with self.model.use_params(params):
yield

def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),

@@ -135,6 +165,7 @@ class BaseThincComponent(object):
return util.to_bytes(serialize, exclude)

def from_bytes(self, bytes_data, **exclude):
"""Load the pipe from a bytestring."""
def load_model(b):
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length

@@ -143,21 +174,23 @@ class BaseThincComponent(object):

deserialize = OrderedDict((
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('model', load_model),
('vocab', lambda b: self.vocab.from_bytes(b))
('model', load_model),
))
util.from_bytes(bytes_data, deserialize, exclude)
return self

def to_disk(self, path, **exclude):
"""Serialize the pipe to disk."""
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('vocab', lambda p: self.vocab.to_disk(p)),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)

def from_disk(self, path, **exclude):
"""Load the pipe from disk."""
def load_model(p):
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length

@@ -166,8 +199,8 @@ class BaseThincComponent(object):

deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('model', load_model),
('vocab', lambda p: self.vocab.from_disk(p)),
('model', load_model),
))
util.from_disk(path, deserialize, exclude)
return self

@@ -215,6 +248,7 @@ class TokenVectorEncoder(BaseThincComponent):
self.model = model
self.cfg = dict(cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.cfg.setdefault('cnn_maxout_pieces', 3)

def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM

@@ -286,9 +320,9 @@ class TokenVectorEncoder(BaseThincComponent):
pipeline (list): The pipeline the model is part of.
"""
if self.model is True:
self.model = self.Model(
pretrained_dims=self.vocab.vectors_length,
**self.cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)

class NeuralTagger(BaseThincComponent):

@@ -297,6 +331,8 @@ class NeuralTagger(BaseThincComponent):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])

def __call__(self, doc):
tags = self.predict(([doc], [doc.tensor]))

@@ -393,15 +429,14 @@ class NeuralTagger(BaseThincComponent):
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
pretrained_dims=self.vocab.vectors_length)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
link_vectors_to_models(self.vocab)

@classmethod
def Model(cls, n_tags, token_vector_width, pretrained_dims=0):
return build_tagger_model(n_tags, token_vector_width,
pretrained_dims)
def Model(cls, n_tags, **cfg):
return build_tagger_model(n_tags, **cfg)

def use_params(self, params):
with self.model.use_params(params):

@@ -422,8 +457,7 @@ class NeuralTagger(BaseThincComponent):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
pretrained_dims=self.vocab.vectors_length)
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.model.from_bytes(b)

def load_tag_map(b):

@@ -442,6 +476,7 @@ class NeuralTagger(BaseThincComponent):
return self

def to_disk(self, path, **exclude):
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(

@@ -456,10 +491,7 @@ class NeuralTagger(BaseThincComponent):
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
**self.cfg)
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
self.model.from_bytes(p.open('rb').read())

def load_tag_map(p):

@@ -486,6 +518,8 @@ class NeuralLabeller(NeuralTagger):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])

@property
def labels(self):

@@ -508,13 +542,13 @@ class NeuralLabeller(NeuralTagger):
self.labels[dep] = len(self.labels)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width,
pretrained_dims=self.vocab.vectors_length)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(len(self.labels), **self.cfg)
link_vectors_to_models(self.vocab)

@classmethod
def Model(cls, n_tags, token_vector_width, pretrained_dims=0):
return build_tagger_model(n_tags, token_vector_width,
pretrained_dims)
def Model(cls, n_tags, **cfg):
return build_tagger_model(n_tags, **cfg)

def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)

@@ -562,7 +596,7 @@ class SimilarityHook(BaseThincComponent):
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

def __call__(self, doc):
'''Install similarity hook'''
"""Install similarity hook"""
doc.user_hooks['similarity'] = self.predict
return doc

@@ -590,6 +624,7 @@ class SimilarityHook(BaseThincComponent):
"""
if self.model is True:
self.model = self.Model(pipeline[0].model.nO)
link_vectors_to_models(self.vocab)

class TextCategorizer(BaseThincComponent):

@@ -663,6 +698,7 @@ class TextCategorizer(BaseThincComponent):
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(len(self.labels), token_vector_width,
**self.cfg)
link_vectors_to_models(self.vocab)

cdef class EntityRecognizer(LinearParser):
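Not part of the commit: the docstrings added to BaseThincComponent above spell out the contract a custom pipe should follow. A minimal sketch of such a subclass with a dummy, model-free predict/set_annotations pair, assuming BaseThincComponent is importable from spacy.pipeline on this branch.

    from spacy.pipeline import BaseThincComponent

    class DummyPipe(BaseThincComponent):
        name = 'dummy'

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model
            self.cfg = dict(cfg)

        def __call__(self, doc):
            # __call__ and pipe() both delegate to predict()/set_annotations().
            scores = self.predict([doc])
            self.set_annotations([doc], scores)
            return doc

        def predict(self, docs):
            return [0.0 for _ in docs]            # placeholder scores

        def set_annotations(self, docs, scores):
            for doc, score in zip(docs, scores):
                doc.user_data['dummy_score'] = score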
spacy/syntax/nn_parser.pyx
@@ -49,6 +49,7 @@ from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer
from .._ml import link_vectors_to_models
from ..compat import json_dumps

from . import _parse_features

@@ -309,6 +310,7 @@ cdef class Parser:
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
if 'pretrained_dims' not in cfg:
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg.setdefault('cnn_maxout_pieces', 3)
self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():

@@ -790,6 +792,7 @@ cdef class Parser:
if self.model is True:
cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
link_vectors_to_models(self.vocab)
self.cfg.update(cfg)

def preprocess_gold(self, docs_golds):

@@ -871,8 +874,7 @@ cdef class Parser:
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
if self.model is True:
self.model, cfg = self.Model(self.moves.n_moves,
pretrained_dims=self.vocab.vectors_length)
self.model, cfg = self.Model(**self.cfg)
cfg['pretrained_dims'] = self.vocab.vectors_length
else:
cfg = {}
spacy/util.py
@@ -14,6 +14,7 @@ import numpy
import io
import dill
from collections import OrderedDict
from thinc.neural._classes.model import Model

import msgpack
import msgpack_numpy

@@ -557,3 +558,14 @@ def minify_html(html):
RETURNS (unicode): "Minified" HTML.
"""
return html.strip().replace(' ', '').replace('\n', '')

def use_gpu(gpu_id):
import cupy.cuda.device
from thinc.neural.ops import CupyOps
device = cupy.cuda.device.Device(gpu_id)
device.use()
Model.ops = CupyOps()
Model.Ops = CupyOps
return device
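Not part of the commit: how the new util.use_gpu() helper is meant to be called; the sketch assumes CuPy is installed and that CUDA device 0 exists.

    from spacy import util

    # Activates the device and switches thinc's Model.ops to CupyOps.
    device = util.use_gpu(0)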
spacy/vocab.pyx
@@ -27,6 +27,7 @@ from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
from ._ml import link_vectors_to_models

cdef class Vocab:

@@ -323,6 +324,7 @@ cdef class Vocab:
self.lexemes_from_bytes(file_.read())
if self.vectors is not None:
self.vectors.from_disk(path, exclude='strings.json')
link_vectors_to_models(self)
return self

def to_bytes(self, **exclude):

@@ -362,6 +364,7 @@ cdef class Vocab:
('vectors', lambda b: serialize_vectors(b))
))
util.from_bytes(bytes_data, setters, exclude)
link_vectors_to_models(self)
return self

def lexemes_to_bytes(self):

@@ -436,6 +439,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
vocab.lex_attr_getters = lex_attr_getters
vocab.lexemes_from_bytes(lexemes_data)
vocab.length = length
link_vectors_to_models(vocab)
return vocab
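Not part of the commit: the Vocab changes above call link_vectors_to_models() on every deserialisation path so that thinc's StaticVectors layers can find the vectors table. A minimal sketch of the same call on a freshly created Vocab, assuming this branch of spaCy.

    from spacy.vocab import Vocab
    from spacy._ml import link_vectors_to_models

    vocab = Vocab()
    # Registers vocab.vectors under the shared VECTORS_KEY so StaticVectors
    # layers built by Tok2Vec can look it up at runtime.
    link_vectors_to_models(vocab)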