Re-add setting for vocab data and tidy up

Ines Montani 2020-07-25 12:14:28 +02:00
parent b9aaa4e457
commit 8d9d28eb8b
5 changed files with 20 additions and 22 deletions

View File

@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true

 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
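For context, a minimal sketch (not part of this commit) of how the re-added setting is read back from a config string; the thinc Config loader is assumed here, and the [nlp] keys mirror the defaults shown above:

from thinc.api import Config

# Hypothetical config string mirroring the defaults above, with the
# re-added load_vocab_data setting switched off for illustration.
cfg_str = """
[nlp]
lang = "en"
pipeline = []
load_vocab_data = false

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
"""

config = Config().from_str(cfg_str)
print(config["nlp"]["load_vocab_data"])  # -> False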

View File

@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out of memory on extremely long texts, due
+            to large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc., before passing
+            them to spaCy. Default maximum length is 1,000,000 characters (1mb).
+            As a rule of thumb, if all pipeline components are enabled, spaCy's
+            default models currently require roughly 1GB of temporary memory per
             100,000 characters in one text.
+        create_tokenizer (Callable): Function that takes the nlp object and
+            returns a tokenizer.
+        create_lemmatizer (Callable): Function that takes the nlp object and
+            returns a lemmatizer.
         RETURNS (Language): The newly constructed object.
         """
         # We're only calling this to import all factories provided via entry
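A usage sketch based on the keyword arguments documented above; whether the bare Language class accepts them in exactly this form at this point in development is an assumption, and the plain whitespace tokenizer factory is purely illustrative:

from spacy.language import Language
from spacy.tokenizer import Tokenizer

def create_plain_tokenizer(nlp):
    # per the docstring above: takes the nlp object, returns a tokenizer
    return Tokenizer(nlp.vocab)

# max_length raised for long documents; custom tokenizer factory passed in
nlp = Language(max_length=2_000_000, create_tokenizer=create_plain_tokenizer)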
@@ -150,12 +153,12 @@ class Language:
             if not create_lemmatizer:
                 lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
                 create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-            # TODO: where does the vocab data come in?
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
                 lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
+                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
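A hedged sketch of calling create_vocab directly with the flag the constructor now reads from the config; the import path and the use of English.Defaults are assumptions for illustration only:

from spacy.vocab import create_vocab
from spacy.lang.en import English

# Equivalent to setting load_vocab_data = false in the [nlp] config block:
# no lookup tables from spacy-lookups-data are pre-populated.
vocab = create_vocab("en", English.Defaults, load_data=False)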

View File

@@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel):
         arbitrary_types_allowed = True


-class ConfigSchemaNlpWritingSystem(BaseModel):
-    direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'")
-    has_case: StrictBool = Field(..., title="Whether the language has case")
-    has_letters: StrictBool = Field(..., title="Whether the language has letters")
-
-    class Config:
-        extra = "allow"
-
-
 class ConfigSchemaNlp(BaseModel):
     # fmt: off
     lang: StrictStr = Field(..., title="The base language to use")
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
+    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     # fmt: on

     class Config:
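A standalone sketch of the validation the added field provides: a trimmed-down pydantic model with the same StrictBool entry, so load_vocab_data from the config must arrive as a real boolean (the model name and reduced field set are not from this commit):

from pydantic import BaseModel, Field, StrictBool, StrictStr

class NlpSectionSketch(BaseModel):
    # trimmed-down stand-in for ConfigSchemaNlp, keeping only two fields
    lang: StrictStr = Field(..., title="The base language to use")
    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")

NlpSectionSketch(lang="en", load_vocab_data=True)    # validates
# NlpSectionSketch(lang="en", load_vocab_data="yes")  # would raise a ValidationError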

View File

@@ -188,8 +188,10 @@ def load_model(
     """Load a model from a package or data path.

     name (str): Package name or model path.
-    **overrides: Specific overrides, like pipeline components to disable.
-    RETURNS (Language): `Language` class with the loaded model.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
     """
     cfg = component_cfg
     if isinstance(name, str):  # name or string path
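A usage sketch for the parameters now documented above; the package name and the per-component override are placeholders, not values taken from this commit:

from spacy.util import load_model

nlp = load_model(
    "en_core_web_sm",                          # placeholder package name
    disable=["parser"],                        # skip this pipeline component
    component_cfg={"ner": {"beam_width": 1}},  # hypothetical per-component override
)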

View File

@@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_lookups_data:
+    if load_data:
         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
         lookups = load_lookups(lang, tables=tables, strict=False)
     else:
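A simplified sketch of the gating shown above with the renamed load_data flag: tables are pulled from spacy-lookups-data only when the flag is set, and strict=False means missing tables are skipped rather than raising; the spacy.lookups import path is an assumption:

from spacy.lookups import Lookups, load_lookups

def build_lookups(lang: str, load_data: bool = True) -> Lookups:
    if load_data:
        # pre-populate from spacy-lookups-data if the package provides tables
        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
        return load_lookups(lang, tables=tables, strict=False)
    # otherwise start from an empty set of tables
    return Lookups()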