Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 18:06:29 +03:00.
Sync vocab in vectors and components sourced in configs (#9335)
Since a component may reference anything in the vocab, share the full vocab when loading source components and vectors (which will include `strings` as of #8909). When loading a source component from a config, save and restore the vocab state after loading source pipelines, in particular to preserve the original state without vectors, since `[initialize.vectors] = null` skips rather than resets the vectors. The vocab references are not synced for components loaded with `Language.add_pipe(source=)` because the pipelines are already loaded and not necessarily with the same vocab. A warning could be added in `Language.create_pipe_from_source` that it may be necessary to save and reload before training, but it's a rare enough case that this kind of warning may be too noisy overall.
This commit is contained in:
parent
6e833b617a
commit
4192e71599
|
@ -707,8 +707,9 @@ class Language:
|
||||||
source_config = source.config.interpolate()
|
source_config = source.config.interpolate()
|
||||||
pipe_config = util.copy_config(source_config["components"][source_name])
|
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||||
self._pipe_configs[name] = pipe_config
|
self._pipe_configs[name] = pipe_config
|
||||||
for s in source.vocab.strings:
|
if self.vocab.strings != source.vocab.strings:
|
||||||
self.vocab.strings.add(s)
|
for s in source.vocab.strings:
|
||||||
|
self.vocab.strings.add(s)
|
||||||
return pipe, pipe_config["factory"]
|
return pipe, pipe_config["factory"]
|
||||||
|
|
||||||
def add_pipe(
|
def add_pipe(
|
||||||
|
@ -1700,6 +1701,7 @@ class Language:
|
||||||
# them here so they're only loaded once
|
# them here so they're only loaded once
|
||||||
source_nlps = {}
|
source_nlps = {}
|
||||||
source_nlp_vectors_hashes = {}
|
source_nlp_vectors_hashes = {}
|
||||||
|
vocab_b = None
|
||||||
for pipe_name in config["nlp"]["pipeline"]:
|
for pipe_name in config["nlp"]["pipeline"]:
|
||||||
if pipe_name not in pipeline:
|
if pipe_name not in pipeline:
|
||||||
opts = ", ".join(pipeline.keys())
|
opts = ", ".join(pipeline.keys())
|
||||||
|
@ -1722,14 +1724,22 @@ class Language:
|
||||||
raw_config=raw_config,
|
raw_config=raw_config,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
# We need the sourced components to reference the same
|
||||||
|
# vocab without modifying the current vocab state **AND**
|
||||||
|
# we still want to load the source model vectors to perform
|
||||||
|
# the vectors check. Since the source vectors clobber the
|
||||||
|
# current ones, we save the original vocab state and
|
||||||
|
# restore after this loop. Existing strings are preserved
|
||||||
|
# during deserialization, so they do not need any
|
||||||
|
# additional handling.
|
||||||
|
if vocab_b is None:
|
||||||
|
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
|
||||||
model = pipe_cfg["source"]
|
model = pipe_cfg["source"]
|
||||||
if model not in source_nlps:
|
if model not in source_nlps:
|
||||||
# We only need the components here and we intentionally
|
# Load with the same vocab, adding any strings
|
||||||
# do not load the model with the same vocab because
|
source_nlps[model] = util.load_model(
|
||||||
# this would cause the vectors to be copied into the
|
model, vocab=nlp.vocab, exclude=["lookups"]
|
||||||
# current nlp object (all the strings will be added in
|
)
|
||||||
# create_pipe_from_source)
|
|
||||||
source_nlps[model] = util.load_model(model)
|
|
||||||
source_name = pipe_cfg.get("component", pipe_name)
|
source_name = pipe_cfg.get("component", pipe_name)
|
||||||
listeners_replaced = False
|
listeners_replaced = False
|
||||||
if "replace_listeners" in pipe_cfg:
|
if "replace_listeners" in pipe_cfg:
|
||||||
|
@ -1756,6 +1766,9 @@ class Language:
|
||||||
# Delete from cache if listeners were replaced
|
# Delete from cache if listeners were replaced
|
||||||
if listeners_replaced:
|
if listeners_replaced:
|
||||||
del source_nlps[model]
|
del source_nlps[model]
|
||||||
|
# Restore the original vocab after sourcing if necessary
|
||||||
|
if vocab_b is not None:
|
||||||
|
nlp.vocab.from_bytes(vocab_b)
|
||||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||||
nlp.batch_size = config["nlp"]["batch_size"]
|
nlp.batch_size = config["nlp"]["batch_size"]
|
||||||
|
|
|
@ -144,7 +144,12 @@ def load_vectors_into_model(
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Load word vectors from an installed model or path into a model instance."""
|
"""Load word vectors from an installed model or path into a model instance."""
|
||||||
try:
|
try:
|
||||||
vectors_nlp = load_model(name)
|
# Load with the same vocab, which automatically adds the vectors to
|
||||||
|
# the current nlp object. Exclude lookups so they are not modified.
|
||||||
|
exclude = ["lookups"]
|
||||||
|
if not add_strings:
|
||||||
|
exclude.append("strings")
|
||||||
|
vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
|
||||||
except ConfigValidationError as e:
|
except ConfigValidationError as e:
|
||||||
title = f"Config validation error for vectors {name}"
|
title = f"Config validation error for vectors {name}"
|
||||||
desc = (
|
desc = (
|
||||||
|
@ -158,15 +163,8 @@ def load_vectors_into_model(
|
||||||
if len(vectors_nlp.vocab.vectors.keys()) == 0:
|
if len(vectors_nlp.vocab.vectors.keys()) == 0:
|
||||||
logger.warning(Warnings.W112.format(name=name))
|
logger.warning(Warnings.W112.format(name=name))
|
||||||
|
|
||||||
nlp.vocab.vectors = vectors_nlp.vocab.vectors
|
|
||||||
for lex in nlp.vocab:
|
for lex in nlp.vocab:
|
||||||
lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
|
lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
|
||||||
if add_strings:
|
|
||||||
# I guess we should add the strings from the vectors_nlp model?
|
|
||||||
# E.g. if someone does a similarity query, they might expect the strings.
|
|
||||||
for key in nlp.vocab.vectors.key2row:
|
|
||||||
if key in vectors_nlp.vocab.strings:
|
|
||||||
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
|
|
||||||
|
|
||||||
|
|
||||||
def init_tok2vec(
|
def init_tok2vec(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user