mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Add link_vectors_to_models function
This commit is contained in:
parent
a186596307
commit
d9124f1aa3
|
@ -4,6 +4,7 @@ from thinc.neural import Model, Maxout, Softmax, Affine
|
||||||
from thinc.neural._classes.hash_embed import HashEmbed
|
from thinc.neural._classes.hash_embed import HashEmbed
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
import thinc.extra.load_nlp
|
||||||
import random
|
import random
|
||||||
import cytoolz
|
import cytoolz
|
||||||
|
|
||||||
|
@ -31,6 +32,7 @@ from . import util
|
||||||
import numpy
|
import numpy
|
||||||
import io
|
import io
|
||||||
|
|
||||||
|
VECTORS_KEY = 'spacy_pretrained_vectors'
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||||
|
|
|
@ -43,6 +43,7 @@ from .compat import json_dumps
|
||||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
||||||
from ._ml import rebatch, Tok2Vec, flatten
|
from ._ml import rebatch, Tok2Vec, flatten
|
||||||
from ._ml import build_text_classifier, build_tagger_model
|
from ._ml import build_text_classifier, build_tagger_model
|
||||||
|
from ._ml import link_vectors_to_models
|
||||||
from .parts_of_speech import X
|
from .parts_of_speech import X
|
||||||
|
|
||||||
|
|
||||||
|
@ -121,6 +122,7 @@ class BaseThincComponent(object):
|
||||||
token_vector_width = pipeline[0].model.nO
|
token_vector_width = pipeline[0].model.nO
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(1, token_vector_width)
|
self.model = self.Model(1, token_vector_width)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
|
@ -215,7 +217,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
self.cfg.setdefault('cnn_maxout_pieces', 3)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||||
|
@ -288,6 +290,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
class NeuralTagger(BaseThincComponent):
|
class NeuralTagger(BaseThincComponent):
|
||||||
|
@ -396,6 +399,7 @@ class NeuralTagger(BaseThincComponent):
|
||||||
exc=vocab.morphology.exc)
|
exc=vocab.morphology.exc)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, **cfg):
|
def Model(cls, n_tags, **cfg):
|
||||||
|
@ -504,8 +508,9 @@ class NeuralLabeller(NeuralTagger):
|
||||||
self.labels[dep] = len(self.labels)
|
self.labels[dep] = len(self.labels)
|
||||||
token_vector_width = pipeline[0].model.nO
|
token_vector_width = pipeline[0].model.nO
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(len(self.labels), token_vector_width,
|
self.model = self.Model(len(self.labels), token_vector_width=token_vector_width,
|
||||||
pretrained_dims=self.vocab.vectors_length)
|
pretrained_dims=self.vocab.vectors_length)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, **cfg):
|
def Model(cls, n_tags, **cfg):
|
||||||
|
@ -585,6 +590,7 @@ class SimilarityHook(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(pipeline[0].model.nO)
|
self.model = self.Model(pipeline[0].model.nO)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
class TextCategorizer(BaseThincComponent):
|
class TextCategorizer(BaseThincComponent):
|
||||||
|
@ -658,6 +664,7 @@ class TextCategorizer(BaseThincComponent):
|
||||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model = self.Model(len(self.labels), token_vector_width,
|
self.model = self.Model(len(self.labels), token_vector_width,
|
||||||
**self.cfg)
|
**self.cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(LinearParser):
|
cdef class EntityRecognizer(LinearParser):
|
||||||
|
|
|
@ -49,6 +49,7 @@ from ..util import get_async, get_cuda_stream
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||||
from .._ml import Residual, drop_layer
|
from .._ml import Residual, drop_layer
|
||||||
|
from .._ml import link_vectors_to_models
|
||||||
from ..compat import json_dumps
|
from ..compat import json_dumps
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
|
@ -309,7 +310,7 @@ cdef class Parser:
|
||||||
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||||
if 'pretrained_dims' not in cfg:
|
if 'pretrained_dims' not in cfg:
|
||||||
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
cfg.setdefault('cnn_maxout_pieces', 2)
|
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
if 'actions' in self.cfg:
|
if 'actions' in self.cfg:
|
||||||
for action, labels in self.cfg.get('actions', {}).items():
|
for action, labels in self.cfg.get('actions', {}).items():
|
||||||
|
@ -791,6 +792,7 @@ cdef class Parser:
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
cfg['pretrained_dims'] = self.vocab.vectors_length
|
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
|
|
||||||
def preprocess_gold(self, docs_golds):
|
def preprocess_gold(self, docs_golds):
|
||||||
|
@ -872,8 +874,7 @@ cdef class Parser:
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if 'model' not in exclude:
|
if 'model' not in exclude:
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model, cfg = self.Model(self.moves.n_moves,
|
self.model, cfg = self.Model(**self.cfg)
|
||||||
pretrained_dims=self.vocab.vectors_length)
|
|
||||||
cfg['pretrained_dims'] = self.vocab.vectors_length
|
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
|
|
|
@ -27,6 +27,7 @@ from .vectors import Vectors
|
||||||
from . import util
|
from . import util
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from . import symbols
|
from . import symbols
|
||||||
|
from ._ml import link_vectors_to_models
|
||||||
|
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
|
@ -323,6 +324,7 @@ cdef class Vocab:
|
||||||
self.lexemes_from_bytes(file_.read())
|
self.lexemes_from_bytes(file_.read())
|
||||||
if self.vectors is not None:
|
if self.vectors is not None:
|
||||||
self.vectors.from_disk(path, exclude='strings.json')
|
self.vectors.from_disk(path, exclude='strings.json')
|
||||||
|
link_vectors_to_models(self)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
@ -362,6 +364,7 @@ cdef class Vocab:
|
||||||
('vectors', lambda b: serialize_vectors(b))
|
('vectors', lambda b: serialize_vectors(b))
|
||||||
))
|
))
|
||||||
util.from_bytes(bytes_data, setters, exclude)
|
util.from_bytes(bytes_data, setters, exclude)
|
||||||
|
link_vectors_to_models(self)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def lexemes_to_bytes(self):
|
def lexemes_to_bytes(self):
|
||||||
|
@ -436,6 +439,7 @@ def unpickle_vocab(sstore, morphology, data_dir,
|
||||||
vocab.lex_attr_getters = lex_attr_getters
|
vocab.lex_attr_getters = lex_attr_getters
|
||||||
vocab.lexemes_from_bytes(lexemes_data)
|
vocab.lexemes_from_bytes(lexemes_data)
|
||||||
vocab.length = length
|
vocab.length = length
|
||||||
|
link_vectors_to_models(vocab)
|
||||||
return vocab
|
return vocab
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user