Add link_vectors_to_models function

Matthew Honnibal 2017-09-22 09:38:22 -05:00
parent a186596307
commit d9124f1aa3
4 changed files with 19 additions and 5 deletions

spacy/_ml.py

@@ -4,6 +4,7 @@ from thinc.neural import Model, Maxout, Softmax, Affine
 from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
+import thinc.extra.load_nlp
 import random
 import cytoolz
@@ -31,6 +32,7 @@ from . import util
 import numpy
 import io
 
+VECTORS_KEY = 'spacy_pretrained_vectors'
 
 @layerize
 def _flatten_add_lengths(seqs, pad=0, drop=0.):
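
The body of link_vectors_to_models itself is not visible in these hunks (it lands further down in _ml.py). Judging from the new VECTORS_KEY constant and the thinc.extra.load_nlp import, a plausible sketch of what the function does, assuming thinc.extra.load_nlp exposes a module-level VECTORS dict keyed by (device, name):

def link_vectors_to_models(vocab):
    # Sketch only; the real body is outside the hunks shown above.
    vectors = vocab.vectors
    ops = Model.ops
    # Point each lexeme's rank at its row in the shared table, so models
    # can look words up by rank instead of copying vectors per component.
    for word in vocab:
        if word.orth in vectors.key2row:
            word.rank = vectors.key2row[word.orth]
        else:
            word.rank = 0
    # Register the table globally under VECTORS_KEY; layers can then
    # resolve the vectors by name instead of serializing the data.
    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = ops.asarray(vectors.data)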

spacy/pipeline.pyx

@@ -43,6 +43,7 @@ from .compat import json_dumps
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten
 from ._ml import build_text_classifier, build_tagger_model
+from ._ml import link_vectors_to_models
 from .parts_of_speech import X
@@ -121,6 +122,7 @@ class BaseThincComponent(object):
         token_vector_width = pipeline[0].model.nO
         if self.model is True:
             self.model = self.Model(1, token_vector_width)
+        link_vectors_to_models(self.vocab)
 
     def use_params(self, params):
         with self.model.use_params(params):
@@ -215,7 +217,7 @@ class TokenVectorEncoder(BaseThincComponent):
         self.model = model
         self.cfg = dict(cfg)
         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
-        self.cfg.setdefault('cnn_maxout_pieces', 2)
+        self.cfg.setdefault('cnn_maxout_pieces', 3)
 
     def __call__(self, doc):
         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@@ -288,6 +290,7 @@ class TokenVectorEncoder(BaseThincComponent):
         """
         if self.model is True:
             self.model = self.Model(**self.cfg)
+        link_vectors_to_models(self.vocab)
 
 
 class NeuralTagger(BaseThincComponent):
@@ -396,6 +399,7 @@ class NeuralTagger(BaseThincComponent):
                                            exc=vocab.morphology.exc)
         if self.model is True:
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+        link_vectors_to_models(self.vocab)
 
     @classmethod
     def Model(cls, n_tags, **cfg):
@@ -504,8 +508,9 @@ class NeuralLabeller(NeuralTagger):
                 self.labels[dep] = len(self.labels)
         token_vector_width = pipeline[0].model.nO
         if self.model is True:
-            self.model = self.Model(len(self.labels), token_vector_width,
+            self.model = self.Model(len(self.labels), token_vector_width=token_vector_width,
                                     pretrained_dims=self.vocab.vectors_length)
+        link_vectors_to_models(self.vocab)
 
     @classmethod
     def Model(cls, n_tags, **cfg):
@@ -585,6 +590,7 @@ class SimilarityHook(BaseThincComponent):
         """
         if self.model is True:
             self.model = self.Model(pipeline[0].model.nO)
+        link_vectors_to_models(self.vocab)
 
 
 class TextCategorizer(BaseThincComponent):
@@ -658,6 +664,7 @@ class TextCategorizer(BaseThincComponent):
             self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(len(self.labels), token_vector_width,
                                     **self.cfg)
+        link_vectors_to_models(self.vocab)
 
 
 cdef class EntityRecognizer(LinearParser):
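
Every component above follows the same convention: self.model is True acts as a sentinel for "not yet constructed", and begin_training both materializes the model and re-registers the vocab's vectors. On the consuming side, a layer can then resolve the table by name. A sketch, assuming thinc's StaticVectors class looks its data up through the load_nlp registry that link_vectors_to_models populates:

from thinc.neural._classes.static_vectors import StaticVectors

def pretrained_embed(width):
    # Hypothetical helper: VECTORS_KEY is the name registered by
    # link_vectors_to_models, so the layer finds the vocab's table in
    # thinc.extra.load_nlp.VECTORS rather than owning a copy of it.
    return StaticVectors(VECTORS_KEY, width, column=0)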

spacy/syntax/nn_parser.pyx

@@ -49,6 +49,7 @@ from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
 from .._ml import Residual, drop_layer
+from .._ml import link_vectors_to_models
 from ..compat import json_dumps
 from . import _parse_features
@@ -309,7 +310,7 @@ cdef class Parser:
         cfg['beam_density'] = util.env_opt('beam_density', 0.0)
         if 'pretrained_dims' not in cfg:
             cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
-        cfg.setdefault('cnn_maxout_pieces', 2)
+        cfg.setdefault('cnn_maxout_pieces', 3)
         self.cfg = cfg
         if 'actions' in self.cfg:
             for action, labels in self.cfg.get('actions', {}).items():
@@ -791,6 +792,7 @@ cdef class Parser:
         if self.model is True:
             cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
+            link_vectors_to_models(self.vocab)
         self.cfg.update(cfg)
 
     def preprocess_gold(self, docs_golds):
@@ -872,8 +874,7 @@ cdef class Parser:
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
             if self.model is True:
-                self.model, cfg = self.Model(self.moves.n_moves,
-                                             pretrained_dims=self.vocab.vectors_length)
+                self.model, cfg = self.Model(**self.cfg)
                 cfg['pretrained_dims'] = self.vocab.vectors_length
             else:
                 cfg = {}
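
The from_bytes change is the one behavioural fix here besides the vector linking: the restored model is now built from the deserialized self.cfg rather than from a single hard-coded pretrained_dims keyword, so settings such as cnn_maxout_pieces survive a save/load round trip. A hypothetical round trip (variable names are placeholders):

data = parser.to_bytes()                  # cfg is serialized alongside weights
restored = Parser(vocab).from_bytes(data)
# Model(**self.cfg) reuses the saved 'cnn_maxout_pieces' and
# 'pretrained_dims', so the rebuilt network matches the one saved.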

spacy/vocab.pyx

@@ -27,6 +27,7 @@ from .vectors import Vectors
 from . import util
 from . import attrs
 from . import symbols
+from ._ml import link_vectors_to_models
 
 
 cdef class Vocab:
@@ -323,6 +324,7 @@ cdef class Vocab:
             self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
+        link_vectors_to_models(self)
         return self
 
     def to_bytes(self, **exclude):
@@ -362,6 +364,7 @@ cdef class Vocab:
             ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
+        link_vectors_to_models(self)
         return self
 
     def lexemes_to_bytes(self):
@@ -436,6 +439,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
     vocab.lex_attr_getters = lex_attr_getters
     vocab.lexemes_from_bytes(lexemes_data)
     vocab.length = length
+    link_vectors_to_models(vocab)
     return vocab
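
With these three hooks, every way a vocab can come back to life (from_disk, from_bytes, unpickling) re-registers its vectors, so no caller has to remember to link them manually. A hypothetical check, assuming the (device, name) key layout sketched earlier:

import thinc.extra.load_nlp
from spacy.vocab import Vocab
from spacy._ml import VECTORS_KEY

vocab = Vocab().from_disk('/tmp/my_vocab')    # placeholder path
# from_disk has already called link_vectors_to_models(vocab), so the
# registry should now hold this vocab's table for the current device.
assert ('cpu', VECTORS_KEY) in thinc.extra.load_nlp.VECTORS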