Commit 4348c479fc: Merge pre-trained vectors and noshare patches

spacy/_ml.py | 86
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -4,6 +4,7 @@ from thinc.neural import Model, Maxout, Softmax, Affine
 from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
+import thinc.extra.load_nlp
 import random
 import cytoolz
 
@@ -31,6 +32,7 @@ from . import util
 import numpy
 import io
 
+VECTORS_KEY = 'spacy_pretrained_vectors'
 
 @layerize
 def _flatten_add_lengths(seqs, pad=0, drop=0.):
@@ -225,45 +227,51 @@ def drop_layer(layer, factor=2.):
     model.predict = layer
     return model
 
 
+def link_vectors_to_models(vocab):
+    vectors = vocab.vectors
+    ops = Model.ops
+    for word in vocab:
+        if word.orth in vectors.key2row:
+            word.rank = vectors.key2row[word.orth]
+        else:
+            word.rank = 0
+    data = ops.asarray(vectors.data)
+    # Set an entry here, so that vectors are accessed by StaticVectors
+    # (unideal, I know)
+    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+
+
 def Tok2Vec(width, embed_size, **kwargs):
     pretrained_dims = kwargs.get('pretrained_dims', 0)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
+    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
+                                 '*': reapply}):
         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
+        if pretrained_dims is not None and pretrained_dims >= 1:
+            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+
+            embed = uniqued(
+                (glove | norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+        else:
+            embed = uniqued(
+                (norm | prefix | suffix | shape)
+                >> LN(Maxout(width, width*4, pieces=3)), column=5)
-        trained_vectors = (
-            FeatureExtracter(cols)
-            >> with_flatten(
-                uniqued(
-                    (norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width*4, pieces=3)), column=5)
-                )
-            )
         convolution = Residual(
             ExtractWindow(nW=1)
             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
         )
 
-        if pretrained_dims >= 1:
-            embed = concatenate_lists(trained_vectors, SpacyVectors)
-            tok2vec = (
-                embed
-                >> with_flatten(
-                    Affine(width, width+pretrained_dims)
-                    >> convolution ** 4,
-                    pad=4)
-            )
-        else:
-            embed = trained_vectors
-            tok2vec = (
-                embed
-                >> with_flatten(convolution ** 4, pad=4)
-            )
+        tok2vec = (
+            FeatureExtracter(cols)
+            >> with_flatten(
+                embed >> (convolution * 4), pad=4)
+        )
 
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
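The new `link_vectors_to_models` helper does two things: it rewrites each lexeme's `rank` so it points at the word's row in the vocab's vector table, and it publishes the table itself in thinc's process-wide `thinc.extra.load_nlp.VECTORS` registry under `VECTORS_KEY`, which is how the `StaticVectors(VECTORS_KEY, ...)` layer added to `Tok2Vec` finds the data at runtime. Below is a self-contained sketch of that registration pattern using plain numpy; the names (`register_vectors`, `lookup`) are illustrative stand-ins, not spaCy's or thinc's API.

```python
import numpy

VECTORS = {}                      # stands in for thinc.extra.load_nlp.VECTORS
VECTORS_KEY = 'spacy_pretrained_vectors'


def register_vectors(data, device='cpu'):
    # Publish the table once per device; layers look it up by name later.
    VECTORS[(device, VECTORS_KEY)] = data


def lookup(word, key2row, device='cpu'):
    # What a StaticVectors-style layer does: name -> table, word -> row -> vector.
    data = VECTORS[(device, VECTORS_KEY)]
    row = key2row.get(word, 0)    # unknown words fall back to row 0
    return data[row]


if __name__ == '__main__':
    table = numpy.zeros((3, 4), dtype='float32')
    table[1] += 1.0
    key2row = {'cat': 1, 'dog': 2}
    register_vectors(table)
    print(lookup('cat', key2row))      # row 1: all ones
    print(lookup('unicorn', key2row))  # missing key: row 0, all zeros
```

Registering by key keeps a single copy of the (potentially large) vector table per device, instead of one copy inside every layer that consumes it.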
@@ -271,6 +279,28 @@ def Tok2Vec(width, embed_size, **kwargs):
     return tok2vec
 
 
+def reapply(layer, n_times):
+    def reapply_fwd(X, drop=0.):
+        backprops = []
+        for i in range(n_times):
+            Y, backprop = layer.begin_update(X, drop=drop)
+            X = Y
+            backprops.append(backprop)
+
+        def reapply_bwd(dY, sgd=None):
+            dX = None
+            for backprop in reversed(backprops):
+                dY = backprop(dY, sgd=sgd)
+                if dX is None:
+                    dX = dY
+                else:
+                    dX += dY
+            return dX
+        return Y, reapply_bwd
+    return wrap(reapply_fwd, layer)
+
+
 def asarray(ops, dtype):
     def forward(X, drop=0.):
         return ops.asarray(X, dtype=dtype), None
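`reapply` backs the new `'*'` operator registered in `Tok2Vec`: `convolution * 4` runs the same residual CNN block four times in sequence, feeding each output back in as the next input and replaying the recorded callbacks in reverse for the backward pass. The toy below sketches that repeated-application idea on a scalar function so the gradient flow can be checked by hand; it illustrates the pattern only, not the thinc wrapper itself (which also accumulates the intermediate input gradients into `dX`).

```python
def make_layer(w):
    # Toy "layer": y = w * x, returning the value and a backprop callback.
    def begin_update(x):
        y = w * x
        def backprop(d_y):
            return d_y * w, d_y * x   # (d_loss/d_x, d_loss/d_w)
        return y, backprop
    return begin_update


def reapply(layer, n_times, x):
    backprops = []
    for _ in range(n_times):
        y, backprop = layer(x)
        backprops.append(backprop)
        x = y                          # feed the output back in
    def backward(d_y):
        d_w_total = 0.0
        for backprop in reversed(backprops):
            d_y, d_w = backprop(d_y)
            d_w_total += d_w           # same weights, so gradients add up
        return d_y, d_w_total
    return y, backward


if __name__ == '__main__':
    layer = make_layer(w=2.0)
    y, backward = reapply(layer, 3, x=1.0)
    d_x, d_w = backward(1.0)
    print(y, d_x, d_w)                 # 8.0  8.0 (= w**3)  12.0 (= 3*w**2*x)
```

Note that the old composition used `convolution ** 4` (thinc's `clone`) while the new one uses `convolution * 4` (this `reapply`), so all four applications reuse the same layer object; the toy shows why their parameter gradients are summed.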
@@ -474,11 +504,13 @@ def getitem(i):
         return X[i], None
     return layerize(getitem_fwd)
 
-def build_tagger_model(nr_class, pretrained_dims=0, **cfg):
+def build_tagger_model(nr_class, **cfg):
     embed_size = util.env_opt('embed_size', 4000)
-    if 'token_vector_width' not in cfg:
+    if 'token_vector_width' in cfg:
+        token_vector_width = cfg['token_vector_width']
+    else:
         token_vector_width = util.env_opt('token_vector_width', 128)
+    pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add}):
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           pretrained_dims=pretrained_dims)
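`build_tagger_model` now takes everything through `**cfg`: `token_vector_width` is read from `cfg` when present (falling back to the environment option), and `pretrained_dims` moves from a keyword parameter to `cfg.get('pretrained_dims', 0)`. A small self-contained sketch of that precedence logic; `env_opt` is approximated here via `os.environ`, and the `SPACY_` prefix is an assumption about `util.env_opt`, not taken from this diff.

```python
import os


def env_opt(name, default):
    # Rough stand-in for spacy.util.env_opt: an environment variable
    # (assumed to be SPACY_<NAME>) overrides the built-in default.
    raw = os.environ.get('SPACY_' + name.upper())
    return type(default)(raw) if raw is not None else default


def tagger_dims(**cfg):
    if 'token_vector_width' in cfg:
        token_vector_width = cfg['token_vector_width']
    else:
        token_vector_width = env_opt('token_vector_width', 128)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    return token_vector_width, pretrained_dims


print(tagger_dims())                                            # (128, 0)
print(tagger_dims(token_vector_width=96, pretrained_dims=300))  # (96, 300)
```

Pushing these values into a single cfg dict is what lets the pipeline components below call `self.Model(n_tags, **self.cfg)` and later rebuild an identically shaped model from the serialized cfg.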
@@ -35,14 +35,14 @@ numpy.random.seed(0)
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
-    resume=("Whether to resume training", "flag", "R", bool),
+    vectors=("Model to load vectors from", "option", "v"),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
+          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
           gold_preproc=False):
     """
     Train a model. Expects data in spaCy's JSON format.
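This hunk, which appears to come from the `spacy train` CLI (the file header was lost in the page extraction), swaps the `resume` flag for a `vectors` option, exposed as `-v` on the command line, naming a model whose vectors should be loaded before training. A hedged sketch of the new call shape follows; the data paths and the vectors package name are made up, and it assumes a spaCy checkout at this commit.

```python
# Illustrative only: arguments mirror the new train() signature shown above.
from spacy.cli import train

train('train', 'en', '/tmp/tagger_model',     # cmd, lang, output_dir
      'train.json', 'dev.json',               # training and dev data
      n_iter=10,
      vectors='my_vectors_model',             # was: resume=True/False
      no_parser=True, no_entities=True)
```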
@@ -78,25 +78,20 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_words = corpus.count_train()
 
-    if not resume:
-        lang_class = util.get_lang_class(lang)
-        nlp = lang_class(pipeline=pipeline)
-        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
-    else:
-        print("Load resume")
-        util.use_gpu(use_gpu)
-        nlp = _resume_model(lang, pipeline, corpus)
-        optimizer = nlp.resume_training(device=use_gpu)
-        lang_class = nlp.__class__
+    lang_class = util.get_lang_class(lang)
+    nlp = lang_class(pipeline=pipeline)
+    if vectors:
+        util.load_model(vectors, vocab=nlp.vocab)
+    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None
 
     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
+        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
+                                       gold_preproc=gold_preproc, max_length=0)
+        train_docs = list(train_docs)
         for i in range(n_iter):
             with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-                train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
-                                               gold_preproc=gold_preproc, max_length=0)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)
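Inside `train()`, the resume branch and its `_resume_model` helper (deleted further down) give way to one unconditional path: build the `Language` subclass, optionally merge a vectors model's vocab into `nlp.vocab`, then call `begin_training`. The helper below restates that order using only the calls that appear in the hunk; the wrapper function itself and its argument names are just for illustration. The hunk also moves `corpus.train_docs(...)` out of the epoch loop and materializes it with `list()`, so the corpus is preprocessed once rather than re-streamed every epoch.

```python
from spacy import util


def build_nlp(lang, pipeline, vectors=None):
    # 1) construct the blank pipeline for the language
    lang_class = util.get_lang_class(lang)
    nlp = lang_class(pipeline=pipeline)
    # 2) merge pre-trained vectors into the vocab *before* any component
    #    sizes its layers, so pretrained_dims is non-zero where it matters
    if vectors:
        util.load_model(vectors, vocab=nlp.vocab)
    return nlp
```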
@@ -109,8 +104,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 util.set_env_log(False)
                 epoch_model_path = output_path / ('model%d' % i)
                 nlp.to_disk(epoch_model_path)
-                #nlp_loaded = lang_class(pipeline=pipeline)
-                #nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                nlp_loaded = lang_class(pipeline=pipeline)
+                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                 scorer = nlp.evaluate(
                     corpus.dev_docs(
                         nlp,
@@ -129,26 +124,6 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             except:
                 pass
 
-
-def _resume_model(lang, pipeline, corpus):
-    nlp = util.load_model(lang)
-    pipes = {getattr(pipe, 'name', None) for pipe in nlp.pipeline}
-    for name in pipeline:
-        if name not in pipes:
-            factory = nlp.Defaults.factories[name]
-            for pipe in factory(nlp):
-                if hasattr(pipe, 'begin_training'):
-                    pipe.begin_training(corpus.train_tuples,
-                                        pipeline=nlp.pipeline)
-                nlp.pipeline.append(pipe)
-    nlp.meta['pipeline'] = pipeline
-    if nlp.vocab.vectors.data.shape[1] >= 1:
-        nlp.vocab.vectors.data = Model.ops.asarray(
-            nlp.vocab.vectors.data)
-    return nlp
-
 
 def _render_parses(i, to_render):
     to_render[0].user_data['title'] = "Batch %d" % i
     with Path('/tmp/entities.html').open('w') as file_:
@@ -348,7 +348,6 @@ class Language(object):
             self._optimizer.device = device
         return self._optimizer
 
-
     def begin_training(self, get_gold_tuples=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.
@@ -43,6 +43,7 @@ from .compat import json_dumps
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten
 from ._ml import build_text_classifier, build_tagger_model
+from ._ml import link_vectors_to_models
 from .parts_of_speech import X
 
 
@@ -121,6 +122,7 @@ class BaseThincComponent(object):
         token_vector_width = pipeline[0].model.nO
         if self.model is True:
             self.model = self.Model(1, token_vector_width)
+        link_vectors_to_models(self.vocab)
 
     def use_params(self, params):
         with self.model.use_params(params):
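Throughout these pipeline hunks, components are created with `model=True` as a placeholder and only allocate the real model in `begin_training`, once the preceding component's output width (`pipeline[0].model.nO`) and the vocab's vectors are known; the new `link_vectors_to_models(self.vocab)` call then ensures the freshly sized model can find the shared vector table. A stripped-down sketch of that lazy-allocation convention; the class and method bodies here are generic stand-ins, not spaCy's.

```python
class LazyComponent(object):
    def __init__(self, vocab, model=True):
        # `True` is a sentinel meaning "allocate me later, once the pipeline
        # context (input width, vector table) is known".
        self.vocab = vocab
        self.model = model
        self.cfg = {}

    def Model(self, n_outputs, input_width):
        return ('model', n_outputs, input_width)   # stand-in for a real net

    def begin_training(self, gold_tuples=None, pipeline=tuple()):
        input_width = pipeline[0] if pipeline else 128
        if self.model is True:
            self.model = self.Model(1, input_width)
        # spaCy additionally calls link_vectors_to_models(self.vocab) here.


component = LazyComponent(vocab=None)
component.begin_training(pipeline=(64,))
print(component.model)   # ('model', 1, 64)
```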
@@ -143,8 +145,8 @@ class BaseThincComponent(object):
 
         deserialize = OrderedDict((
             ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
-            ('model', load_model),
             ('vocab', lambda b: self.vocab.from_bytes(b))
+            ('model', load_model),
         ))
         util.from_bytes(bytes_data, deserialize, exclude)
         return self
@@ -152,8 +154,8 @@ class BaseThincComponent(object):
     def to_disk(self, path, **exclude):
         serialize = OrderedDict((
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
+            ('vocab', lambda p: self.vocab.to_disk(p)),
             ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
-            ('vocab', lambda p: self.vocab.to_disk(p))
         ))
         util.to_disk(path, serialize, exclude)
 
@@ -166,8 +168,8 @@ class BaseThincComponent(object):
 
         deserialize = OrderedDict((
             ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
-            ('model', load_model),
             ('vocab', lambda p: self.vocab.from_disk(p)),
+            ('model', load_model),
         ))
         util.from_disk(path, deserialize, exclude)
         return self
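The `OrderedDict` reordering in `from_bytes`/`to_disk`/`from_disk` is not cosmetic: the entries are applied in order, and the model deserializer can only build a correctly shaped network after the vocab (and with it the vector table and `pretrained_dims`) has been restored, so `'vocab'` now comes before `'model'`. A minimal sketch of why that ordering matters, with generic names rather than spaCy's API:

```python
from collections import OrderedDict

state = {'vectors_width': None, 'model_width': None}


def load_vocab(_msg):
    state['vectors_width'] = 300          # pretend the vocab brought vectors


def load_model(_msg):
    # Sizing the model depends on what the vocab loader has already set.
    assert state['vectors_width'] is not None, 'vocab must be loaded first'
    state['model_width'] = 128 + state['vectors_width']


deserialize = OrderedDict((
    ('vocab', load_vocab),    # must run before 'model'
    ('model', load_model),
))
for name, setter in deserialize.items():
    setter(b'...')
print(state)                  # {'vectors_width': 300, 'model_width': 428}
```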
@@ -215,7 +217,7 @@ class TokenVectorEncoder(BaseThincComponent):
         self.model = model
         self.cfg = dict(cfg)
         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
-        self.cfg.setdefault('cnn_maxout_pieces', 2)
+        self.cfg.setdefault('cnn_maxout_pieces', 3)
 
     def __call__(self, doc):
         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@@ -287,7 +289,9 @@ class TokenVectorEncoder(BaseThincComponent):
         pipeline (list): The pipeline the model is part of.
         """
         if self.model is True:
+            self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(**self.cfg)
+        link_vectors_to_models(self.vocab)
 
 
 class NeuralTagger(BaseThincComponent):
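`TokenVectorEncoder` (and the tagger-family components below) now record the width of the vocab's vector table in `self.cfg['pretrained_dims']` before building the model, instead of passing `pretrained_dims=` explicitly. Since `cfg` is what gets serialized, the dimension travels with the component. A tiny sketch of the idea, using plain numpy and illustrative names:

```python
import numpy


def make_component_cfg(vector_data, token_vector_width=128):
    cfg = {'token_vector_width': token_vector_width}
    # Width of each pretrained vector row; 0 columns means "no vectors".
    cfg['pretrained_dims'] = vector_data.shape[1]
    cfg.setdefault('cnn_maxout_pieces', 3)
    return cfg


no_vectors = numpy.zeros((0, 0), dtype='float32')
glove_like = numpy.zeros((20000, 300), dtype='float32')
print(make_component_cfg(no_vectors))    # pretrained_dims == 0
print(make_component_cfg(glove_like))    # pretrained_dims == 300
```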
@@ -297,6 +301,7 @@ class NeuralTagger(BaseThincComponent):
         self.model = model
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
+        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
 
     def __call__(self, doc):
         tags = self.predict([doc])
@@ -388,8 +393,9 @@ class NeuralTagger(BaseThincComponent):
                                                   vocab.morphology.lemmatizer,
                                                   exc=vocab.morphology.exc)
         if self.model is True:
-            self.model = self.Model(self.vocab.morphology.n_tags,
-                                    pretrained_dims=self.vocab.vectors_length)
+            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+        link_vectors_to_models(self.vocab)
 
     @classmethod
     def Model(cls, n_tags, **cfg):
@@ -414,8 +420,7 @@ class NeuralTagger(BaseThincComponent):
             if self.model is True:
                 token_vector_width = util.env_opt('token_vector_width',
                                                   self.cfg.get('token_vector_width', 128))
-                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
-                                        pretrained_dims=self.vocab.vectors_length)
+                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
             self.model.from_bytes(b)
 
         def load_tag_map(b):
@@ -449,10 +454,7 @@ class NeuralTagger(BaseThincComponent):
     def from_disk(self, path, **exclude):
         def load_model(p):
             if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width',
-                                                  self.cfg.get('token_vector_width', 128))
-                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width,
-                                        **self.cfg)
+                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
             self.model.from_bytes(p.open('rb').read())
 
         def load_tag_map(p):
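The tagger's `from_bytes`/`from_disk` no longer recompute `token_vector_width` from the environment; they rebuild the network purely from the stored `self.cfg` (`self.Model(n_tags, **self.cfg)`) and then load the weights into it. The sketch below shows the round trip this enables: as long as cfg captures every shape-determining setting, the rebuilt model matches the saved one. The names and the toy "model" are illustrative only.

```python
import json


def build_model(n_tags, token_vector_width=128, pretrained_dims=0, **_unused):
    # Stand-in for build_tagger_model: only the shapes matter here.
    return {'n_tags': n_tags, 'width': token_vector_width + pretrained_dims}


# Save: keep the weights *and* the cfg that determined the shapes.
cfg = {'token_vector_width': 96, 'pretrained_dims': 300}
saved = {'cfg': cfg, 'weights': [0.0] * 5}
blob = json.dumps(saved)

# Load: rebuild from cfg first, then restore the weights into that shape.
restored = json.loads(blob)
model = build_model(50, **restored['cfg'])
assert model == build_model(50, **cfg)
print(model)   # {'n_tags': 50, 'width': 396}
```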
@@ -480,6 +482,7 @@ class NeuralLabeller(NeuralTagger):
         self.model = model
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
+        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
 
     @property
     def labels(self):
@@ -502,13 +505,13 @@ class NeuralLabeller(NeuralTagger):
                     self.labels[dep] = len(self.labels)
         token_vector_width = pipeline[0].model.nO
         if self.model is True:
-            self.model = self.Model(len(self.labels), token_vector_width,
-                                    pretrained_dims=self.vocab.vectors_length)
+            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+            self.model = self.Model(len(self.labels), **self.cfg)
+        link_vectors_to_models(self.vocab)
 
     @classmethod
-    def Model(cls, n_tags, token_vector_width, pretrained_dims=0):
-        return build_tagger_model(n_tags, token_vector_width,
-                                  pretrained_dims)
+    def Model(cls, n_tags, **cfg):
+        return build_tagger_model(n_tags, **cfg)
 
     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
@@ -579,6 +582,7 @@ class SimilarityHook(BaseThincComponent):
         """
         if self.model is True:
             self.model = self.Model(pipeline[0].model.nO)
+        link_vectors_to_models(self.vocab)
 
 
 class TextCategorizer(BaseThincComponent):
@@ -650,6 +654,7 @@ class TextCategorizer(BaseThincComponent):
             self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(len(self.labels), token_vector_width,
                                     **self.cfg)
+        link_vectors_to_models(self.vocab)
 
 
 cdef class EntityRecognizer(LinearParser):
@@ -49,6 +49,7 @@ from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
 from .._ml import Residual, drop_layer, flatten
+from .._ml import link_vectors_to_models
 from ..compat import json_dumps
 
 from . import _parse_features
@@ -310,6 +311,7 @@ cdef class Parser:
         cfg['beam_density'] = util.env_opt('beam_density', 0.0)
         if 'pretrained_dims' not in cfg:
             cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
+        cfg.setdefault('cnn_maxout_pieces', 3)
         self.cfg = cfg
         if 'actions' in self.cfg:
             for action, labels in self.cfg.get('actions', {}).items():
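The parser's `__init__` fills `cfg` defensively: `pretrained_dims` is only taken from the vocab when the key is absent, and `cnn_maxout_pieces` is added with `setdefault`, so values coming from a deserialized or user-supplied cfg always win over the built-in defaults. A short illustration of that precedence, with a made-up helper name:

```python
def fill_defaults(cfg, vectors_width):
    if 'pretrained_dims' not in cfg:
        cfg['pretrained_dims'] = vectors_width
    cfg.setdefault('cnn_maxout_pieces', 3)
    return cfg


print(fill_defaults({}, vectors_width=300))
# {'pretrained_dims': 300, 'cnn_maxout_pieces': 3}
print(fill_defaults({'pretrained_dims': 0, 'cnn_maxout_pieces': 2}, 300))
# {'pretrained_dims': 0, 'cnn_maxout_pieces': 2}   <- explicit values kept
```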
@@ -339,7 +341,7 @@ cdef class Parser:
             self.set_annotations([doc], states)
             return doc
         else:
-            beam = self.beam_parse([doc],
-                beam_width=beam_width, beam_density=beam_density)[0]
+            beam = self.beam_parse([doc],
+                beam_width=beam_width, beam_density=beam_density)[0]
             output = self.moves.get_beam_annot(beam)
             state = <StateClass>beam.at(0)
@@ -754,6 +756,7 @@ cdef class Parser:
         if self.model is True:
             cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
+            link_vectors_to_models(self.vocab)
         self.cfg.update(cfg)
 
     def preprocess_gold(self, docs_golds):
@@ -835,8 +838,7 @@ cdef class Parser:
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
             if self.model is True:
-                self.model, cfg = self.Model(self.moves.n_moves,
-                                             pretrained_dims=self.vocab.vectors_length)
+                self.model, cfg = self.Model(**self.cfg)
                 cfg['pretrained_dims'] = self.vocab.vectors_length
             else:
                 cfg = {}
@@ -27,6 +27,7 @@ from .vectors import Vectors
 from . import util
 from . import attrs
 from . import symbols
+from ._ml import link_vectors_to_models
 
 
 cdef class Vocab:
@@ -323,6 +324,7 @@ cdef class Vocab:
             self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
+        link_vectors_to_models(self)
         return self
 
     def to_bytes(self, **exclude):
@@ -362,6 +364,7 @@ cdef class Vocab:
             ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
+        link_vectors_to_models(self)
         return self
 
     def lexemes_to_bytes(self):
@@ -436,6 +439,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
     vocab.lex_attr_getters = lex_attr_getters
     vocab.lexemes_from_bytes(lexemes_data)
     vocab.length = length
+    link_vectors_to_models(vocab)
     return vocab
 
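The `Vocab` hunks call `link_vectors_to_models` after every path that can replace the vector table (`from_disk`, `from_bytes`, and unpickling), because the global registry would otherwise keep pointing at the old array. A final self-contained sketch of that refresh-on-reload idea, reusing the registry pattern from the first example; names are illustrative.

```python
import numpy

VECTORS = {}                               # process-wide registry
VECTORS_KEY = 'spacy_pretrained_vectors'


def link(data, device='cpu'):
    VECTORS[(device, VECTORS_KEY)] = data  # models read from here


old_table = numpy.zeros((2, 4), dtype='float32')
link(old_table)

# Simulate Vocab.from_disk()/from_bytes(): a brand-new array replaces the old.
new_table = numpy.ones((5, 4), dtype='float32')
# Without re-linking, lookups still hit old_table:
print(VECTORS[('cpu', VECTORS_KEY)].shape)   # (2, 4)  <- stale
link(new_table)                              # what the new calls ensure
print(VECTORS[('cpu', VECTORS_KEY)].shape)   # (5, 4)  <- current
```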