Clean up unused link_vectors_to_models code

Matthew Honnibal 2020-07-28 22:17:47 +02:00
parent 0c17ea4c85
commit 1784c95827
15 changed files with 6 additions and 51 deletions
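The deletions below all follow one pattern: pipeline components no longer push the vocab's vectors into their models by hand after initialization. A minimal before/after sketch of the begin_training tail that recurs throughout this diff (shown out of context; `util` is spacy.util):

    # before: after initializing, each component linked vectors manually
    self.model.initialize()
    util.link_vectors_to_models(self.vocab)  # removed in this commit
    if sgd is None:
        sgd = self.create_optimizer()
    return sgd

    # after: initializing the model is enough
    self.model.initialize()
    if sgd is None:
        sgd = self.create_optimizer()
    return sgd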

View File

@@ -11,7 +11,6 @@ from ...util import ensure_path, working_dir
 from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
 # TODO: find a solution for caches
 # CACHES = [
 #     Path.home() / ".torch",

View File

@@ -21,7 +21,7 @@ from .vocab import Vocab, create_vocab
 from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
 from .gold import Example
 from .scorer import Scorer
-from .util import link_vectors_to_models, create_default_optimizer, registry
+from .util import create_default_optimizer, registry
 from .util import SimpleFrozenDict, combine_score_weights
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@@ -1049,7 +1049,6 @@ class Language:
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = create_default_optimizer()
         self._optimizer = sgd
@@ -1082,7 +1081,6 @@ class Language:
         ops = get_current_ops()
         if self.vocab.vectors.data.shape[1] >= 1:
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = create_default_optimizer()
         self._optimizer = sgd
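Both Language hunks keep the backend conversion that runs before the optimizer is created. For context, a minimal sketch of what get_current_ops/asarray do, assuming Thinc 8 (the array here is a stand-in for vocab.vectors.data):

    import numpy
    from thinc.api import get_current_ops

    # the active backend: NumpyOps on CPU, CupyOps when a GPU is allocated
    ops = get_current_ops()
    data = numpy.zeros((1000, 300), dtype="f")
    # copy the table onto that backend so vectors live where the models compute
    data = ops.asarray(data)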

View File

@@ -149,7 +149,6 @@ class Morphologizer(Tagger):
                 self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         self.set_output(len(self.labels))
         self.model.initialize()
-        util.link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd

View File

@@ -11,7 +11,6 @@ from .tagger import Tagger
 from ..language import Language
 from ..syntax import nonproj
 from ..attrs import POS, ID
-from ..util import link_vectors_to_models
 from ..errors import Errors
@@ -91,7 +90,6 @@ class MultitaskObjective(Tagger):
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
         self.model.initialize()
-        link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
@@ -179,7 +177,6 @@ class ClozeMultitask(Pipe):
         pass

     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
-        link_vectors_to_models(self.vocab)
         self.model.initialize()
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
         self.model.output_layer.begin_training(X)
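ClozeMultitask keeps its shape-inference step: a dummy batch sized from the tok2vec output width primes the output layer. A standalone sketch of the same trick with plain Thinc layers (sizes are arbitrary, not spaCy's):

    from thinc.api import Linear, chain

    model = chain(Linear(nO=8, nI=4), Linear(nO=2))
    # initializing on a dummy batch lets the second layer infer its input width
    X = model.ops.alloc((5, 4))
    model.initialize(X=X)
    assert model.layers[1].get_dim("nI") == 8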

View File

@@ -3,7 +3,7 @@ import srsly
 from ..tokens.doc cimport Doc
-from ..util import link_vectors_to_models, create_default_optimizer
+from ..util import create_default_optimizer
 from ..errors import Errors
 from .. import util
@@ -145,8 +145,6 @@ class Pipe:
         DOCS: https://spacy.io/api/pipe#begin_training
         """
         self.model.initialize()
-        if hasattr(self, "vocab"):
-            link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd

View File

@@ -138,7 +138,6 @@ class SentenceRecognizer(Tagger):
         """
         self.set_output(len(self.labels))
         self.model.initialize()
-        util.link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd

View File

@@ -168,7 +168,6 @@ class SimpleNER(Pipe):
         self.model.initialize()
         if pipeline is not None:
             self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
-        util.link_vectors_to_models(self.vocab)
         self.loss_func = SequenceCategoricalCrossentropy(
             names=self.get_tag_names(), normalize=True, missing_value=None
         )
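The loss construction above survives the cleanup unchanged. For context, a minimal sketch of Thinc's sequence loss, assuming per-token probability arrays and made-up tag names:

    import numpy
    from thinc.api import SequenceCategoricalCrossentropy

    loss = SequenceCategoricalCrossentropy(names=["O", "U-PER"], normalize=True)
    guesses = [numpy.asarray([[0.9, 0.1], [0.2, 0.8]], dtype="f")]  # one array per doc
    truths = [["O", "U-PER"]]
    d_scores, score = loss(guesses, truths)  # per-sequence gradients plus a scalar loss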

View File

@@ -318,7 +318,6 @@ class Tagger(Pipe):
         self.model.initialize(X=doc_sample)
         # Get batch of example docs, example outputs to call begin_training().
         # This lets the model infer shapes.
-        util.link_vectors_to_models(self.vocab)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd

View File

@@ -356,7 +356,6 @@ class TextCategorizer(Pipe):
         docs = [Doc(Vocab(), words=["hello"])]
         truths, _ = self._examples_to_truth(examples)
         self.set_output(len(self.labels))
-        util.link_vectors_to_models(self.vocab)
         self.model.initialize(X=docs, Y=truths)
         if sgd is None:
             sgd = self.create_optimizer()

View File

@@ -7,7 +7,7 @@ from ..tokens import Doc
 from ..vocab import Vocab
 from ..language import Language
 from ..errors import Errors
-from ..util import link_vectors_to_models, minibatch
+from ..util import minibatch

 default_model_config = """
@@ -198,7 +198,6 @@ class Tok2Vec(Pipe):
         """
         docs = [Doc(self.vocab, words=["hello"])]
         self.model.initialize(X=docs)
-        link_vectors_to_models(self.vocab)

 class Tok2VecListener(Model):

View File

@@ -21,7 +21,7 @@ from .transition_system cimport Transition
 from ..compat import copy_array
 from ..errors import Errors, TempErrors
-from ..util import link_vectors_to_models, create_default_optimizer
+from ..util import create_default_optimizer
 from .. import util
 from . import nonproj

View File

@@ -29,7 +29,7 @@ from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition
-from ..util import link_vectors_to_models, create_default_optimizer, registry
+from ..util import create_default_optimizer, registry
 from ..compat import copy_array
 from ..errors import Errors, Warnings
 from .. import util
@@ -456,7 +456,6 @@ cdef class Parser:
         self.model.initialize()
         if pipeline is not None:
             self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
-        link_vectors_to_models(self.vocab)
         return sgd

     def to_disk(self, path, exclude=tuple()):

View File

@@ -9,7 +9,6 @@ from spacy.matcher import Matcher
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.compat import pickle
-from spacy.util import link_vectors_to_models
 import numpy
 import random
@@ -190,7 +189,6 @@ def test_issue2871():
         _ = vocab[word]  # noqa: F841
         vocab.set_vector(word, vector_data[0])
     vocab.vectors.name = "dummy_vectors"
-    link_vectors_to_models(vocab)
     assert vocab["dog"].rank == 0
     assert vocab["cat"].rank == 1
     assert vocab["SUFFIX"].rank == 2

View File

@@ -1200,29 +1200,6 @@ class DummyTokenizer:
         return self

-def link_vectors_to_models(
-    vocab: "Vocab",
-    models: List[Model] = [],
-    *,
-    vectors_name_attr="vectors_name",
-    vectors_attr="vectors",
-    key2row_attr="key2row",
-    default_vectors_name="spacy_pretrained_vectors",
-) -> None:
-    """Supply vectors data to models."""
-    vectors = vocab.vectors
-    if vectors.name is None:
-        vectors.name = default_vectors_name
-        if vectors.data.size != 0:
-            warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
-    for model in models:
-        for node in model.walk():
-            if node.attrs.get(vectors_name_attr) == vectors.name:
-                node.attrs[vectors_attr] = Unserializable(vectors.data)
-                node.attrs[key2row_attr] = Unserializable(vectors.key2row)

 def create_default_optimizer() -> Optimizer:
     # TODO: Do we still want to allow env_opt?
     learn_rate = env_opt("learn_rate", 0.001)
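The deleted helper relied on Thinc's model-tree traversal to find nodes expecting vectors and attach the data as unserializable attrs. A minimal sketch of that traversal, assuming plain Thinc 8 layers (sizes arbitrary):

    import numpy
    from thinc.api import Linear, Relu, chain

    model = chain(Relu(nO=8, nI=4), Linear(nO=2))
    model.initialize(X=numpy.zeros((2, 4), dtype="f"))
    # walk() yields every node in the tree, so attrs can be set wholesale
    for node in model.walk():
        print(node.name, dict(node.attrs))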

View File

@@ -16,7 +16,7 @@ from .errors import Errors
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
-from .util import link_vectors_to_models, registry
+from .util import registry
 from .lookups import Lookups, load_lookups
 from . import util
 from .lang.norm_exceptions import BASE_NORMS
@@ -344,7 +344,6 @@ cdef class Vocab:
             synonym = self.strings[syn_keys[i][0]]
             score = scores[i][0]
             remap[word] = (synonym, score)
-        link_vectors_to_models(self)
         return remap

     def get_vector(self, orth, minn=None, maxn=None):
@@ -476,8 +475,6 @@ cdef class Vocab:
         if "vectors" not in exclude:
             if self.vectors is not None:
                 self.vectors.from_disk(path, exclude=["strings"])
-                if self.vectors.name is not None:
-                    link_vectors_to_models(self)
         if "lookups" not in exclude:
             self.lookups.from_disk(path)
             if "lexeme_norm" in self.lookups:
@@ -537,8 +534,6 @@ cdef class Vocab:
             )
         self.length = 0
         self._by_orth = PreshMap()
-        if self.vectors.name is not None:
-            link_vectors_to_models(self)
         return self

     def _reset_cache(self, keys, strings):
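With the re-linking gone, deserialization simply restores the vectors table. A minimal sketch of the exclude mechanism these methods take (the path is hypothetical):

    from spacy.vocab import Vocab

    vocab = Vocab()
    # skip the vectors table entirely; lookups and strings load as usual
    vocab.from_disk("/tmp/vocab", exclude=["vectors"])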