mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Sync vocab in vectors and components sourced in configs (#9335)
Since a component may reference anything in the vocab, share the full vocab when loading source components and vectors (which will include `strings` as of #8909). When loading a source component from a config, save the vocab state before loading the source pipelines and restore it afterwards, in particular to preserve the original state without vectors, since `[initialize.vectors] = null` skips rather than resets the vectors. The vocab references are not synced for components loaded with `Language.add_pipe(source=)` because those pipelines are already loaded and not necessarily with the same vocab. A warning could be added in `Language.create_pipe_from_source` that it may be necessary to save and reload before training, but it's a rare enough case that this kind of warning may be too noisy overall.
This commit is contained in:
parent
6e833b617a
commit
4192e71599
|
@ -707,8 +707,9 @@ class Language:
|
|||
source_config = source.config.interpolate()
|
||||
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||
self._pipe_configs[name] = pipe_config
|
||||
for s in source.vocab.strings:
|
||||
self.vocab.strings.add(s)
|
||||
if self.vocab.strings != source.vocab.strings:
|
||||
for s in source.vocab.strings:
|
||||
self.vocab.strings.add(s)
|
||||
return pipe, pipe_config["factory"]
|
||||
|
||||
def add_pipe(
|
||||
|
@ -1700,6 +1701,7 @@ class Language:
|
|||
# them here so they're only loaded once
|
||||
source_nlps = {}
|
||||
source_nlp_vectors_hashes = {}
|
||||
vocab_b = None
|
||||
for pipe_name in config["nlp"]["pipeline"]:
|
||||
if pipe_name not in pipeline:
|
||||
opts = ", ".join(pipeline.keys())
|
||||
|
@ -1722,14 +1724,22 @@ class Language:
|
|||
raw_config=raw_config,
|
||||
)
|
||||
else:
|
||||
# We need the sourced components to reference the same
|
||||
# vocab without modifying the current vocab state **AND**
|
||||
# we still want to load the source model vectors to perform
|
||||
# the vectors check. Since the source vectors clobber the
|
||||
# current ones, we save the original vocab state and
|
||||
# restore after this loop. Existing strings are preserved
|
||||
# during deserialization, so they do not need any
|
||||
# additional handling.
|
||||
if vocab_b is None:
|
||||
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
|
||||
model = pipe_cfg["source"]
|
||||
if model not in source_nlps:
|
||||
# We only need the components here and we intentionally
|
||||
# do not load the model with the same vocab because
|
||||
# this would cause the vectors to be copied into the
|
||||
# current nlp object (all the strings will be added in
|
||||
# create_pipe_from_source)
|
||||
source_nlps[model] = util.load_model(model)
|
||||
# Load with the same vocab, adding any strings
|
||||
source_nlps[model] = util.load_model(
|
||||
model, vocab=nlp.vocab, exclude=["lookups"]
|
||||
)
|
||||
source_name = pipe_cfg.get("component", pipe_name)
|
||||
listeners_replaced = False
|
||||
if "replace_listeners" in pipe_cfg:
|
||||
|
@ -1756,6 +1766,9 @@ class Language:
|
|||
# Delete from cache if listeners were replaced
|
||||
if listeners_replaced:
|
||||
del source_nlps[model]
|
||||
# Restore the original vocab after sourcing if necessary
|
||||
if vocab_b is not None:
|
||||
nlp.vocab.from_bytes(vocab_b)
|
||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||
nlp.batch_size = config["nlp"]["batch_size"]
|
||||
|
|
|
@ -144,7 +144,12 @@ def load_vectors_into_model(
|
|||
) -> None:
|
||||
"""Load word vectors from an installed model or path into a model instance."""
|
||||
try:
|
||||
vectors_nlp = load_model(name)
|
||||
# Load with the same vocab, which automatically adds the vectors to
|
||||
# the current nlp object. Exclude lookups so they are not modified.
|
||||
exclude = ["lookups"]
|
||||
if not add_strings:
|
||||
exclude.append("strings")
|
||||
vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
|
||||
except ConfigValidationError as e:
|
||||
title = f"Config validation error for vectors {name}"
|
||||
desc = (
|
||||
|
@ -158,15 +163,8 @@ def load_vectors_into_model(
|
|||
if len(vectors_nlp.vocab.vectors.keys()) == 0:
|
||||
logger.warning(Warnings.W112.format(name=name))
|
||||
|
||||
nlp.vocab.vectors = vectors_nlp.vocab.vectors
|
||||
for lex in nlp.vocab:
|
||||
lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
|
||||
if add_strings:
|
||||
# I guess we should add the strings from the vectors_nlp model?
|
||||
# E.g. if someone does a similarity query, they might expect the strings.
|
||||
for key in nlp.vocab.vectors.key2row:
|
||||
if key in vectors_nlp.vocab.strings:
|
||||
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
|
||||
|
||||
|
||||
def init_tok2vec(
|
||||
|
|
Loading…
Reference in New Issue
Block a user