Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00
Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded at the start of training rather than when the vocab is initialized, since the vocab doesn't have access to the full config when it's created. The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any `Lookups` object can be provided. The loading from `spacy-lookups-data` is now strict, so configs for each language should specify the exact tables required. This also makes it easier to control whether the larger clusters and probs tables are included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```
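Since any `Lookups` object can be provided, a project can also register its own table source instead of `spacy.LoadLookupsData.v1`. A minimal sketch, assuming the `registry.misc` and `Lookups` APIs used in this commit; the registered name, function, and table contents are illustrative only:

```python
from spacy.lookups import Lookups
from spacy.util import registry


@registry.misc("my_project.CustomLookups.v1")  # hypothetical registered name
def load_custom_lookups(lang: str) -> Lookups:
    # Build the tables in code instead of pulling them from spacy-lookups-data.
    lookups = Lookups()
    lookups.add_table("lexeme_norm", {"gonna": "going to"})
    return lookups


# The training config would then point training.lookups at the custom function:
#
#   [training.lookups]
#   @misc = "my_project.CustomLookups.v1"
#   lang = ${nlp.lang}
```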
This commit is contained in:
parent 0406200a1e
commit eed4b785f5
```
@@ -88,6 +88,7 @@ def train(
    sourced_components = get_sourced_components(config)
    with show_validation_error(config_path):
        nlp, config = util.load_model_from_config(config)
    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
    if config["training"]["vectors"] is not None:
        util.load_vectors_into_model(nlp, config["training"]["vectors"])
    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
```
```
@@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false
lang = null
pipeline = []
disabled = []
load_vocab_data = true
before_creation = null
after_creation = null
after_pipeline_creation = null
```

```
@@ -58,6 +57,7 @@ accumulate_gradient = 1
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
lookups = null
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
```
```
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups


# This is the base config will all settings (training etc.)
```

```
@@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
    return tokenizer_factory


@registry.misc("spacy.LoadLookupsData.v1")
def load_lookups_data(lang, tables):
    lookups = load_lookups(lang=lang, tables=tables)
    return lookups


class Language:
    """A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your application.
```

```
@@ -152,7 +159,6 @@ class Language:
                self.lang,
                self.Defaults,
                vectors_name=vectors_name,
                load_data=self._config["nlp"]["load_vocab_data"],
            )
        else:
            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
```
```
@@ -8,6 +8,7 @@ from collections import defaultdict
from thinc.api import Optimizer

from .attrs import NAMES
from .lookups import Lookups

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
```

```
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
class ConfigSchemaTraining(BaseModel):
    # fmt: off
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
    dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
    train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
    batcher: Batcher = Field(..., title="Batcher for the training data")
```

```
@@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel):
    pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
    disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
    tokenizer: Callable = Field(..., title="The tokenizer to use")
    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
    before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
    after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
    after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
```
```
@@ -69,7 +69,6 @@ def test_util_dot_section():
    [nlp]
    lang = "en"
    pipeline = ["textcat"]
    load_vocab_data = false

    [components]
```

```
@@ -95,15 +94,13 @@ def test_util_dot_section():
    # not exclusive_classes
    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
    # Test that default values got overwritten
    assert not en_config["nlp"]["load_vocab_data"]
    assert nl_config["nlp"]["load_vocab_data"]  # default value True
    assert en_config["nlp"]["pipeline"] == ["textcat"]
    assert nl_config["nlp"]["pipeline"] == []  # default value []
    # Test proper functioning of 'dot_to_object'
    with pytest.raises(KeyError):
        dot_to_object(en_config, "nlp.pipeline.tagger")
    with pytest.raises(KeyError):
        dot_to_object(en_config, "nlp.unknownattribute")
    assert not dot_to_object(en_config, "nlp.load_vocab_data")
    assert dot_to_object(nl_config, "nlp.load_vocab_data")
    assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
```
```
@@ -253,6 +253,14 @@ def load_vectors_into_model(
        nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


def load_vocab_data_into_model(
    nlp: "Language", *, lookups: Optional["Lookups"]=None
) -> None:
    """Load vocab data."""
    if lookups:
        nlp.vocab.load_lookups(lookups)


def load_model(
    name: Union[str, Path],
    *,
```
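For reference, a sketch of how the new `load_vocab_data_into_model` helper could be called outside the `train` CLI, assuming `spacy-lookups-data` is installed and using the `load_lookups` loader referenced in this commit; the language and table choice are illustrative:

```python
from spacy.lang.en import English
from spacy.lookups import load_lookups
from spacy.util import load_vocab_data_into_model

nlp = English()
# Loading is strict: this raises if the requested table isn't available for "en".
lookups = load_lookups("en", tables=["lexeme_norm"])
load_vocab_data_into_model(nlp, lookups=lookups)
```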
```
@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


def create_vocab(lang, defaults, vectors_name=None, load_data=True):
def create_vocab(lang, defaults, vectors_name=None):
    # If the spacy-lookups-data package is installed, we pre-populate the lookups
    # with lexeme data, if available
    if load_data:
        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
        lookups = load_lookups(lang, tables=tables, strict=False)
    else:
        lookups = Lookups()
    lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
    # This is messy, but it's the minimal working fix to Issue #639.
    lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
```

```
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
    lex_attrs[NORM] = util.add_lookups(
        lex_attrs.get(NORM, LEX_ATTRS[NORM]),
        BASE_NORMS,
        lookups.get_table("lexeme_norm", {}),
    )
    return Vocab(
        lex_attr_getters=lex_attrs,
        lookups=lookups,
        writing_system=defaults.writing_system,
        get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
        vectors_name=vectors_name,
```

```
@@ -424,6 +417,14 @@ cdef class Vocab:
            orth = self.strings.add(orth)
        return orth in self.vectors

    def load_lookups(self, lookups):
        self.lookups = lookups
        if lookups.has_table("lexeme_norm"):
            self.lex_attr_getters[NORM] = util.add_lookups(
                self.lex_attr_getters[NORM],
                lookups.get_table("lexeme_norm"),
            )

    def to_disk(self, path, *, exclude=tuple()):
        """Save the current state to a directory.
```
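And a small sketch of the effect of the new `Vocab.load_lookups` method, assuming the behaviour added above; the `lexeme_norm` entry is made up for illustration:

```python
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})
nlp.vocab.load_lookups(lookups)
# The NORM lexeme attribute getter now consults the lexeme_norm table first.
print(nlp.vocab["gonna"].norm_)  # expected: "going to"
```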