Sync vocab in vectors and components sourced in configs (#9335)

Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save the vocab state
before loading the source pipelines and restore it afterwards, in
particular to preserve the original state without vectors, since
`[initialize.vectors] = null` skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
This commit is contained in:
Adriane Boyd 2021-10-04 12:19:02 +02:00 committed by GitHub
parent 6e833b617a
commit 4192e71599
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 16 deletions

View File

@ -707,8 +707,9 @@ class Language:
source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config
for s in source.vocab.strings:
self.vocab.strings.add(s)
if self.vocab.strings != source.vocab.strings:
for s in source.vocab.strings:
self.vocab.strings.add(s)
return pipe, pipe_config["factory"]
def add_pipe(
@ -1700,6 +1701,7 @@ class Language:
# them here so they're only loaded once
source_nlps = {}
source_nlp_vectors_hashes = {}
vocab_b = None
for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
@ -1722,14 +1724,22 @@ class Language:
raw_config=raw_config,
)
else:
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
# the vectors check. Since the source vectors clobber the
# current ones, we save the original vocab state and
# restore after this loop. Existing strings are preserved
# during deserialization, so they do not need any
# additional handling.
if vocab_b is None:
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
model = pipe_cfg["source"]
if model not in source_nlps:
# We only need the components here and we intentionally
# do not load the model with the same vocab because
# this would cause the vectors to be copied into the
# current nlp object (all the strings will be added in
# create_pipe_from_source)
source_nlps[model] = util.load_model(model)
# Load with the same vocab, adding any strings
source_nlps[model] = util.load_model(
model, vocab=nlp.vocab, exclude=["lookups"]
)
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
@ -1756,6 +1766,9 @@ class Language:
# Delete from cache if listeners were replaced
if listeners_replaced:
del source_nlps[model]
# Restore the original vocab after sourcing if necessary
if vocab_b is not None:
nlp.vocab.from_bytes(vocab_b)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.batch_size = config["nlp"]["batch_size"]

View File

@ -144,7 +144,12 @@ def load_vectors_into_model(
) -> None:
"""Load word vectors from an installed model or path into a model instance."""
try:
vectors_nlp = load_model(name)
# Load with the same vocab, which automatically adds the vectors to
# the current nlp object. Exclude lookups so they are not modified.
exclude = ["lookups"]
if not add_strings:
exclude.append("strings")
vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
except ConfigValidationError as e:
title = f"Config validation error for vectors {name}"
desc = (
@ -158,15 +163,8 @@ def load_vectors_into_model(
if len(vectors_nlp.vocab.vectors.keys()) == 0:
logger.warning(Warnings.W112.format(name=name))
nlp.vocab.vectors = vectors_nlp.vocab.vectors
for lex in nlp.vocab:
lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
if add_strings:
# I guess we should add the strings from the vectors_nlp model?
# E.g. if someone does a similarity query, they might expect the strings.
for key in nlp.vocab.vectors.key2row:
if key in vectors_nlp.vocab.strings:
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def init_tok2vec(