Re-add setting for vocab data and tidy up

Ines Montani 2020-07-25 12:14:28 +02:00
parent b9aaa4e457
commit 8d9d28eb8b
5 changed files with 20 additions and 22 deletions

spacy/default_config.cfg

@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true

 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
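
For context, a quick sketch of the new setting being overridden when a config is parsed. This assumes the thinc Config API that spaCy's config system is built on; the "en" value is a placeholder:

    from thinc.api import Config

    # Parse a config string that flips the new [nlp] setting off. The keys
    # mirror the default config in the diff above.
    cfg_str = (
        "[nlp]\n"
        'lang = "en"\n'
        "pipeline = []\n"
        "load_vocab_data = false\n\n"
        "[nlp.tokenizer]\n"
        '@tokenizers = "spacy.Tokenizer.v1"\n'
    )

    config = Config().from_str(cfg_str)
    assert config["nlp"]["load_vocab_data"] is False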

spacy/language.py

@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out of memory on extremely long texts, due
+            to large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc., before passing
+            them to spaCy. The default maximum length is 1,000,000 characters
+            (1 MB). As a rule of thumb, if all pipeline components are enabled,
+            spaCy's default models currently require roughly 1GB of temporary memory per
             100,000 characters in one text.
+        create_tokenizer (Callable): Function that takes the nlp object and
+            returns a tokenizer.
+        create_lemmatizer (Callable): Function that takes the nlp object and
+            returns a lemmatizer.
         RETURNS (Language): The newly constructed object.
         """
         # We're only calling this to import all factories provided via entry
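
The reflowed docstring above documents the max_length attribute; a minimal, runnable illustration of the behavior it describes (spacy.blank keeps it self-contained):

    import spacy

    nlp = spacy.blank("en")
    print(nlp.max_length)       # 1000000, the documented default
    nlp.max_length = 2_000_000  # raise the cap if the memory budget allows it
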
@@ -150,12 +153,12 @@ class Language:
             if not create_lemmatizer:
                 lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
                 create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-            # TODO: where does the vocab data come in?
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
                 lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
+                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
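
Condensed, the wiring introduced here looks roughly like the following. This is a sketch, not the real `__init__`, and the import path is an assumption:

    from spacy.vocab import create_vocab  # assumed import path

    def make_vocab(config: dict, lang: str, defaults):
        # The config setting decides whether the optional lexeme tables
        # from spacy-lookups-data are loaded into the new vocab.
        return create_vocab(lang, defaults, load_data=config["nlp"]["load_vocab_data"])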

spacy/schemas.py

@@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel):
         arbitrary_types_allowed = True


 class ConfigSchemaNlpWritingSystem(BaseModel):
     direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'")
     has_case: StrictBool = Field(..., title="Whether the language has case")
     has_letters: StrictBool = Field(..., title="Whether the language has letters")

     class Config:
         extra = "allow"


 class ConfigSchemaNlp(BaseModel):
     # fmt: off
     lang: StrictStr = Field(..., title="The base language to use")
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
+    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     # fmt: on

     class Config:
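
A note on the StrictBool style used for the re-added field: pydantic's strict types reject coercible values instead of converting them. A self-contained sketch, where ExampleSchema is a made-up stand-in rather than spaCy's schema:

    from pydantic import BaseModel, Field, StrictBool, ValidationError

    class ExampleSchema(BaseModel):
        # Mirrors the style of the field added above.
        load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data")

    ExampleSchema(load_vocab_data=True)        # passes validation
    try:
        ExampleSchema(load_vocab_data="true")  # a string is rejected, not coerced
    except ValidationError as err:
        print(err)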

spacy/util.py

@@ -188,8 +188,10 @@ def load_model(
     """Load a model from a package or data path.

     name (str): Package name or model path.
-    **overrides: Specific overrides, like pipeline components to disable.
-    RETURNS (Language): `Language` class with the loaded model.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
     """
     cfg = component_cfg
     if isinstance(name, str):  # name or string path
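
A usage sketch of the parameters documented above. The package name is a placeholder that must be installed, and the empty component_cfg dict only shows the expected shape:

    import spacy

    # Load a pipeline, skip the parser entirely, and pass per-component
    # config overrides keyed by component name (empty here).
    nlp = spacy.load(
        "en_core_web_sm",           # placeholder package name
        disable=["parser"],
        component_cfg={"ner": {}},  # shape only; no actual overrides
    )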

spacy/vocab.pyx

@@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_lookups_data:
+    if load_data:
         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
         lookups = load_lookups(lang, tables=tables, strict=False)
     else:
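
Finally, a sketch of the renamed parameter in use. The import path and the use of English.Defaults are assumptions for illustration:

    from spacy.lang.en import English
    from spacy.vocab import create_vocab  # assumed import path

    # With load_data=False, the lexeme_norm/prob/cluster/settings tables
    # from spacy-lookups-data are never consulted.
    vocab = create_vocab("en", English.Defaults, load_data=False)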