From 4192e715994ac46f8ded67608115153899457892 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 4 Oct 2021 12:19:02 +0200 Subject: [PATCH] Sync vocab in vectors and components sourced in configs (#9335) Since a component may reference anything in the vocab, share the full vocab when loading source components and vectors (which will include `strings` as of #8909). When loading a source component from a config, save and restore the vocab state after loading source pipelines, in particular to preserve the original state without vectors, since `[initialize.vectors] = null` skips rather than resets the vectors. The vocab references are not synced for components loaded with `Language.add_pipe(source=)` because the pipelines are already loaded and not necessarily with the same vocab. A warning could be added in `Language.create_pipe_from_source` that it may be necessary to save and reload before training, but it's a rare enough case that this kind of warning may be too noisy overall. --- spacy/language.py | 29 +++++++++++++++++++++-------- spacy/training/initialize.py | 14 ++++++-------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 6abbc6f56..81d740d74 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -707,8 +707,9 @@ class Language: source_config = source.config.interpolate() pipe_config = util.copy_config(source_config["components"][source_name]) self._pipe_configs[name] = pipe_config - for s in source.vocab.strings: - self.vocab.strings.add(s) + if self.vocab.strings != source.vocab.strings: + for s in source.vocab.strings: + self.vocab.strings.add(s) return pipe, pipe_config["factory"] def add_pipe( @@ -1700,6 +1701,7 @@ class Language: # them here so they're only loaded once source_nlps = {} source_nlp_vectors_hashes = {} + vocab_b = None for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) @@ -1722,14 +1724,22 @@ class Language: raw_config=raw_config, ) else: + # We need the sourced components to reference the same + # vocab without modifying the current vocab state **AND** + # we still want to load the source model vectors to perform + # the vectors check. Since the source vectors clobber the + # current ones, we save the original vocab state and + # restore after this loop. Existing strings are preserved + # during deserialization, so they do not need any + # additional handling. + if vocab_b is None: + vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"]) model = pipe_cfg["source"] if model not in source_nlps: - # We only need the components here and we intentionally - # do not load the model with the same vocab because - # this would cause the vectors to be copied into the - # current nlp object (all the strings will be added in - # create_pipe_from_source) - source_nlps[model] = util.load_model(model) + # Load with the same vocab, adding any strings + source_nlps[model] = util.load_model( + model, vocab=nlp.vocab, exclude=["lookups"] + ) source_name = pipe_cfg.get("component", pipe_name) listeners_replaced = False if "replace_listeners" in pipe_cfg: @@ -1756,6 +1766,9 @@ class Language: # Delete from cache if listeners were replaced if listeners_replaced: del source_nlps[model] + # Restore the original vocab after sourcing if necessary + if vocab_b is not None: + nlp.vocab.from_bytes(vocab_b) disabled_pipes = [*config["nlp"]["disabled"], *disable] nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp.batch_size = config["nlp"]["batch_size"] diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index bd014f75f..4eb8ea276 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -144,7 +144,12 @@ def load_vectors_into_model( ) -> None: """Load word vectors from an installed model or path into a model instance.""" try: - vectors_nlp = load_model(name) + # Load with the same vocab, which automatically adds the vectors to + # the current nlp object. Exclude lookups so they are not modified. + exclude = ["lookups"] + if not add_strings: + exclude.append("strings") + vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude) except ConfigValidationError as e: title = f"Config validation error for vectors {name}" desc = ( @@ -158,15 +163,8 @@ def load_vectors_into_model( if len(vectors_nlp.vocab.vectors.keys()) == 0: logger.warning(Warnings.W112.format(name=name)) - nlp.vocab.vectors = vectors_nlp.vocab.vectors for lex in nlp.vocab: lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK) - if add_strings: - # I guess we should add the strings from the vectors_nlp model? - # E.g. if someone does a similarity query, they might expect the strings. - for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.vocab.strings: - nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) def init_tok2vec(