mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Sync vocab in vectors and components sourced in configs (#9335)
Since a component may reference anything in the vocab, share the full vocab when loading source components and vectors (which will include `strings` as of #8909). When loading a source component from a config, save the vocab state before loading the source pipelines and restore it afterwards, in particular to preserve the original state without vectors, since `[initialize.vectors] = null` skips rather than resets the vectors. The vocab references are not synced for components loaded with `Language.add_pipe(source=)` because those pipelines are already loaded, and not necessarily with the same vocab. A warning could be added in `Language.create_pipe_from_source` that it may be necessary to save and reload before training, but it's a rare enough case that this kind of warning may be too noisy overall.
This commit is contained in:
parent
6e833b617a
commit
4192e71599
|
@ -707,8 +707,9 @@ class Language:
|
|||
source_config = source.config.interpolate()
|
||||
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||
self._pipe_configs[name] = pipe_config
|
||||
for s in source.vocab.strings:
|
||||
self.vocab.strings.add(s)
|
||||
if self.vocab.strings != source.vocab.strings:
|
||||
for s in source.vocab.strings:
|
||||
self.vocab.strings.add(s)
|
||||
return pipe, pipe_config["factory"]
|
||||
|
||||
def add_pipe(
|
||||
|
@ -1700,6 +1701,7 @@ class Language:
|
|||
# them here so they're only loaded once
|
||||
source_nlps = {}
|
||||
source_nlp_vectors_hashes = {}
|
||||
vocab_b = None
|
||||
for pipe_name in config["nlp"]["pipeline"]:
|
||||
if pipe_name not in pipeline:
|
||||
opts = ", ".join(pipeline.keys())
|
||||
|
@ -1722,14 +1724,22 @@ class Language:
|
|||
raw_config=raw_config,
|
||||
)
|
||||
else:
|
||||
# We need the sourced components to reference the same
|
||||
# vocab without modifying the current vocab state **AND**
|
||||
# we still want to load the source model vectors to perform
|
||||
# the vectors check. Since the source vectors clobber the
|
||||
# current ones, we save the original vocab state and
|
||||
# restore after this loop. Existing strings are preserved
|
||||
# during deserialization, so they do not need any
|
||||
# additional handling.
|
||||
if vocab_b is None:
|
||||
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
|
||||
model = pipe_cfg["source"]
|
||||
if model not in source_nlps:
|
||||
# We only need the components here and we intentionally
|
||||
# do not load the model with the same vocab because
|
||||
# this would cause the vectors to be copied into the
|
||||
# current nlp object (all the strings will be added in
|
||||
# create_pipe_from_source)
|
||||
source_nlps[model] = util.load_model(model)
|
||||
# Load with the same vocab, adding any strings
|
||||
source_nlps[model] = util.load_model(
|
||||
model, vocab=nlp.vocab, exclude=["lookups"]
|
||||
)
|
||||
source_name = pipe_cfg.get("component", pipe_name)
|
||||
listeners_replaced = False
|
||||
if "replace_listeners" in pipe_cfg:
|
||||
|
@ -1756,6 +1766,9 @@ class Language:
|
|||
# Delete from cache if listeners were replaced
|
||||
if listeners_replaced:
|
||||
del source_nlps[model]
|
||||
# Restore the original vocab after sourcing if necessary
|
||||
if vocab_b is not None:
|
||||
nlp.vocab.from_bytes(vocab_b)
|
||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||
nlp.batch_size = config["nlp"]["batch_size"]
|
||||
|
|
|
@ -144,7 +144,12 @@ def load_vectors_into_model(
|
|||
) -> None:
|
||||
"""Load word vectors from an installed model or path into a model instance."""
|
||||
try:
|
||||
vectors_nlp = load_model(name)
|
||||
# Load with the same vocab, which automatically adds the vectors to
|
||||
# the current nlp object. Exclude lookups so they are not modified.
|
||||
exclude = ["lookups"]
|
||||
if not add_strings:
|
||||
exclude.append("strings")
|
||||
vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
|
||||
except ConfigValidationError as e:
|
||||
title = f"Config validation error for vectors {name}"
|
||||
desc = (
|
||||
|
@ -158,15 +163,8 @@ def load_vectors_into_model(
|
|||
if len(vectors_nlp.vocab.vectors.keys()) == 0:
|
||||
logger.warning(Warnings.W112.format(name=name))
|
||||
|
||||
nlp.vocab.vectors = vectors_nlp.vocab.vectors
|
||||
for lex in nlp.vocab:
|
||||
lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
|
||||
if add_strings:
|
||||
# I guess we should add the strings from the vectors_nlp model?
|
||||
# E.g. if someone does a similarity query, they might expect the strings.
|
||||
for key in nlp.vocab.vectors.key2row:
|
||||
if key in vectors_nlp.vocab.strings:
|
||||
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
|
||||
|
||||
|
||||
def init_tok2vec(
|
||||
|
|
Loading…
Reference in New Issue
Block a user