Mirror of https://github.com/explosion/spaCy.git
Merge pull request #6092 from adrianeboyd/bugfix/load-vocab-lookups-2
Commit e863b3dc14
@@ -87,6 +87,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
@@ -12,7 +12,6 @@ gpu_allocator = null
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -59,6 +58,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
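The [training] block now has a lookups slot that defaults to null. A user config can fill it with the spacy.LookupsDataLoader.v1 function registered further down in this diff. The following is a minimal sketch, assuming spacy-lookups-data is installed; the table list ("lexeme_norm") is an illustrative choice, not part of this diff:

# Sketch only: parse a user override for [training.lookups].
# spaCy resolves the @misc reference to the registered loader when the
# training config is loaded; here we only parse the block with thinc's Config.
from thinc.api import Config

cfg_str = """
[training.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = "en"
tables = ["lexeme_norm"]
"""
config = Config().from_str(cfg_str)
print(config["training"]["lookups"])  # {'@misc': 'spacy.LookupsDataLoader.v1', ...}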
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups


 # This is the base config will all settings (training etc.)
@@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
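The loader registered above can also be fetched and called by hand through the registry. A hedged sketch, assuming spacy-lookups-data is installed and provides a "lexeme_norm" table for English (the table list is an assumption, not taken from this diff):

# Sketch only: resolve the registered function and run it directly.
from spacy.util import registry

loader = registry.misc.get("spacy.LookupsDataLoader.v1")
lookups = loader(lang="en", tables=["lexeme_norm"])
print(lookups.tables)  # expected: ['lexeme_norm']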
@@ -152,7 +160,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer

 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -228,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
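For context on why the schema can type this field as Lookups at all: pydantic only accepts a plain class like Lookups as a field type when arbitrary types are allowed, which spaCy's config schemas enable. A standalone sketch (not spaCy's actual schema) illustrating the idea:

# Standalone illustration, not spaCy code: Optional[Lookups] works as a field type
# because arbitrary_types_allowed lets pydantic accept the class as-is.
from typing import Optional
from pydantic import BaseModel, Field
from spacy.lookups import Lookups


class TrainingLookupsDemo(BaseModel):
    lookups: Optional[Lookups] = Field(None, title="Vocab lookups")

    class Config:
        arbitrary_types_allowed = True


print(TrainingLookupsDemo(lookups=Lookups()).lookups)
print(TrainingLookupsDemo().lookups)  # None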
@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false

     [components]
@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
     assert en_config["nlp"]["pipeline"] == ["textcat"]
     assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
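The assertions above go through dot_to_object, which resolves a dotted path inside a nested config and raises KeyError for unknown keys. A small sketch of that behaviour, assuming the helper exported from spacy.util (the dict literal is illustrative):

# Sketch of the helper the test exercises.
from spacy.util import dot_to_object

config = {"nlp": {"lang": "en", "pipeline": ["textcat"]}}
print(dot_to_object(config, "nlp.lang"))      # "en"
print(dot_to_object(config, "nlp.pipeline"))  # ["textcat"]
try:
    dot_to_object(config, "nlp.unknownattribute")
except KeyError as err:
    print("unknown key:", err)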
@@ -253,6 +253,14 @@ def load_vectors_into_model(
                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"]=None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.lookups = lookups
+
+
 def load_model(
     name: Union[str, Path],
     *,
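A usage sketch for the helper added here, assuming a blank English pipeline and a hand-built table (the table contents are invented for illustration; this relies on load_vocab_data_into_model as defined in this diff):

# Sketch: attach a Lookups object to an existing pipeline's vocab.
from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy import util

nlp = English()
lookups = Lookups()
lookups.add_table("lexeme_norm", {"wont": "will not"})  # illustrative data
util.load_vocab_data_into_model(nlp, lookups=lookups)
print(nlp.vocab.lookups.has_table("lexeme_norm"))  # expected: True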
@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length
@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,
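With the table loading removed from create_vocab, a freshly created pipeline starts with empty lookups; tables are only attached when something loads them explicitly (for example via the training config). A quick hedged check:

# Sketch: a blank pipeline no longer pre-populates lexeme tables at vocab creation.
import spacy

nlp = spacy.blank("en")
print(nlp.vocab.lookups.tables)  # expected: [] (nothing loaded yet)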
@@ -424,6 +417,19 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
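The new setter re-wires the NORM lexeme attribute whenever the assigned lookups contain a lexeme_norm table, so lexemes created afterwards pick up the normalized form. A hedged sketch of the effect (the table entry is invented for illustration):

# Sketch: assigning lookups with a "lexeme_norm" table updates the NORM getter.
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})  # illustrative entry
nlp.vocab.lookups = lookups
print(nlp.vocab["gonna"].norm_)  # expected: "going to"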