Re-add setting for vocab data and tidy up

Ines Montani 2020-07-25 12:14:28 +02:00
parent b9aaa4e457
commit 8d9d28eb8b
5 changed files with 20 additions and 22 deletions

View File

@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true

 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
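For context, a minimal sketch (not part of this commit) of how the re-added setting is read back from a config string; the thinc Config loader is assumed here, and the [nlp] keys mirror the defaults shown above:

from thinc.api import Config

# Hypothetical config string mirroring the defaults above, with the
# re-added load_vocab_data setting switched off for illustration.
cfg_str = """
[nlp]
lang = "en"
pipeline = []
load_vocab_data = false

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
"""

config = Config().from_str(cfg_str)
print(config["nlp"]["load_vocab_data"])  # -> False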

View File

@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out of memory on extremely long texts, due
+            to large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc., before passing
+            them to spaCy. Default maximum length is 1,000,000 characters (1mb).
+            As a rule of thumb, if all pipeline components are enabled, spaCy's
+            default models currently require roughly 1GB of temporary memory per
             100,000 characters in one text.
+        create_tokenizer (Callable): Function that takes the nlp object and
+            returns a tokenizer.
+        create_lemmatizer (Callable): Function that takes the nlp object and
+            returns a lemmatizer.
         RETURNS (Language): The newly constructed object.
         """
         # We're only calling this to import all factories provided via entry
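A usage sketch based on the keyword arguments documented above; whether the bare Language class accepts them in exactly this form at this point in development is an assumption, and the plain whitespace tokenizer factory is purely illustrative:

from spacy.language import Language
from spacy.tokenizer import Tokenizer

def create_plain_tokenizer(nlp):
    # per the docstring above: takes the nlp object, returns a tokenizer
    return Tokenizer(nlp.vocab)

# max_length raised for long documents; custom tokenizer factory passed in
nlp = Language(max_length=2_000_000, create_tokenizer=create_plain_tokenizer)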
@@ -150,12 +153,12 @@ class Language:
             if not create_lemmatizer:
                 lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
                 create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-            # TODO: where does the vocab data come in?
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
                 lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
+                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
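A hedged sketch of calling create_vocab directly with the flag the constructor now reads from the config; the import path and the use of English.Defaults are assumptions for illustration only:

from spacy.vocab import create_vocab
from spacy.lang.en import English

# Equivalent to setting load_vocab_data = false in the [nlp] config block:
# no lookup tables from spacy-lookups-data are pre-populated.
vocab = create_vocab("en", English.Defaults, load_data=False)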

View File

@@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel):
         arbitrary_types_allowed = True


-class ConfigSchemaNlpWritingSystem(BaseModel):
-    direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'")
-    has_case: StrictBool = Field(..., title="Whether the language has case")
-    has_letters: StrictBool = Field(..., title="Whether the language has letters")
-
-    class Config:
-        extra = "allow"
-
-
 class ConfigSchemaNlp(BaseModel):
     # fmt: off
     lang: StrictStr = Field(..., title="The base language to use")
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
+    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     # fmt: on

     class Config:
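A standalone sketch of the validation the added field provides: a trimmed-down pydantic model with the same StrictBool entry, so load_vocab_data from the config must arrive as a real boolean (the model name and reduced field set are not from this commit):

from pydantic import BaseModel, Field, StrictBool, StrictStr

class NlpSectionSketch(BaseModel):
    # trimmed-down stand-in for ConfigSchemaNlp, keeping only two fields
    lang: StrictStr = Field(..., title="The base language to use")
    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")

NlpSectionSketch(lang="en", load_vocab_data=True)    # validates
# NlpSectionSketch(lang="en", load_vocab_data="yes")  # would raise a ValidationError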

View File

@@ -188,8 +188,10 @@ def load_model(
     """Load a model from a package or data path.

     name (str): Package name or model path.
-    **overrides: Specific overrides, like pipeline components to disable.
-    RETURNS (Language): `Language` class with the loaded model.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
     """
     cfg = component_cfg
     if isinstance(name, str):  # name or string path
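A usage sketch for the parameters now documented above; the package name and the per-component override are placeholders, not values taken from this commit:

from spacy.util import load_model

nlp = load_model(
    "en_core_web_sm",                          # placeholder package name
    disable=["parser"],                        # skip this pipeline component
    component_cfg={"ner": {"beam_width": 1}},  # hypothetical per-component override
)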

View File

@@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_lookups_data:
+    if load_data:
         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
         lookups = load_lookups(lang, tables=tables, strict=False)
     else:
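A simplified sketch of the gating shown above with the renamed load_data flag: tables are pulled from spacy-lookups-data only when the flag is set, and strict=False means missing tables are skipped rather than raising; the spacy.lookups import path is an assumption:

from spacy.lookups import Lookups, load_lookups

def build_lookups(lang: str, load_data: bool = True) -> Lookups:
    if load_data:
        # pre-populate from spacy-lookups-data if the package provides tables
        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
        return load_lookups(lang, tables=tables, strict=False)
    # otherwise start from an empty set of tables
    return Lookups()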