From 4192e715994ac46f8ded67608115153899457892 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 4 Oct 2021 12:19:02 +0200
Subject: [PATCH] Sync vocab in vectors and components sourced in configs
 (#9335)

Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save the vocab state
before loading the source pipelines and restore it afterwards, in
particular to preserve the original state without vectors, since
`[initialize.vectors] = null` skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
---
 spacy/language.py            | 29 +++++++++++++++++++++--------
 spacy/training/initialize.py | 14 ++++++--------
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 6abbc6f56..81d740d74 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -707,8 +707,9 @@ class Language:
         source_config = source.config.interpolate()
         pipe_config = util.copy_config(source_config["components"][source_name])
         self._pipe_configs[name] = pipe_config
-        for s in source.vocab.strings:
-            self.vocab.strings.add(s)
+        if self.vocab.strings != source.vocab.strings:
+            for s in source.vocab.strings:
+                self.vocab.strings.add(s)
         return pipe, pipe_config["factory"]
 
     def add_pipe(
@@ -1700,6 +1701,7 @@ class Language:
         # them here so they're only loaded once
         source_nlps = {}
         source_nlp_vectors_hashes = {}
+        vocab_b = None
         for pipe_name in config["nlp"]["pipeline"]:
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
@@ -1722,14 +1724,22 @@ class Language:
                         raw_config=raw_config,
                     )
                 else:
+                    # We need the sourced components to reference the same
+                    # vocab without modifying the current vocab state **AND**
+                    # we still want to load the source model vectors to perform
+                    # the vectors check. Since the source vectors clobber the
+                    # current ones, we save the original vocab state and
+                    # restore after this loop. Existing strings are preserved
+                    # during deserialization, so they do not need any
+                    # additional handling.
+                    if vocab_b is None:
+                        vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
                     model = pipe_cfg["source"]
                     if model not in source_nlps:
-                        # We only need the components here and we intentionally
-                        # do not load the model with the same vocab because
-                        # this would cause the vectors to be copied into the
-                        # current nlp object (all the strings will be added in
-                        # create_pipe_from_source)
-                        source_nlps[model] = util.load_model(model)
+                        # Load with the same vocab, adding any strings
+                        source_nlps[model] = util.load_model(
+                            model, vocab=nlp.vocab, exclude=["lookups"]
+                        )
                     source_name = pipe_cfg.get("component", pipe_name)
                     listeners_replaced = False
                     if "replace_listeners" in pipe_cfg:
@@ -1756,6 +1766,9 @@ class Language:
                     # Delete from cache if listeners were replaced
                     if listeners_replaced:
                         del source_nlps[model]
+        # Restore the original vocab after sourcing if necessary
+        if vocab_b is not None:
+            nlp.vocab.from_bytes(vocab_b)
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
         nlp.batch_size = config["nlp"]["batch_size"]
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index bd014f75f..4eb8ea276 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -144,7 +144,12 @@ def load_vectors_into_model(
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
-        vectors_nlp = load_model(name)
+        # Load with the same vocab, which automatically adds the vectors to
+        # the current nlp object. Exclude lookups so they are not modified.
+        exclude = ["lookups"]
+        if not add_strings:
+            exclude.append("strings")
+        vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
     except ConfigValidationError as e:
         title = f"Config validation error for vectors {name}"
         desc = (
@@ -158,15 +163,8 @@ def load_vectors_into_model(
     if len(vectors_nlp.vocab.vectors.keys()) == 0:
         logger.warning(Warnings.W112.format(name=name))
 
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
     for lex in nlp.vocab:
         lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
 def init_tok2vec(