Mirror of https://github.com/explosion/spaCy.git
Synced 2025-01-26 01:04:34 +03:00

Commit 8d9d28eb8b (parent b9aaa4e457)
Re-add setting for vocab data and tidy up
@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true
 
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
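The new `load_vocab_data` setting lives in the `[nlp]` block of the default config. As a rough illustration (not part of the commit), the snippet below parses a config string like the one above with thinc's `Config` class, which spaCy's config system is built on; the section and key names come from the hunk, everything else is assumed.

# Sketch only: parse the [nlp] block shown above and read back the setting.
from thinc.api import Config

cfg_str = """
[nlp]
lang = null
pipeline = []
load_vocab_data = true

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
"""

config = Config().from_str(cfg_str)
# Values are JSON-interpreted: true -> True, null -> None.
assert config["nlp"]["load_vocab_data"] is True
assert config["nlp"]["lang"] is None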
@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out of memory on extremely long texts, due to
+            large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc, before passing
+            them to spaCy. Default maximum length is 1,000,000 characters (1mb).
+            As a rule of thumb, if all pipeline components are enabled, spaCy's
+            default models currently require roughly 1GB of temporary memory per
             100,000 characters in one text.
+        create_tokenizer (Callable): Function that takes the nlp object and
+            returns a tokenizer.
+        create_lemmatizer (Callable): Function that takes the nlp object and
+            returns a lemmatizer.
         RETURNS (Language): The newly constructed object.
         """
         # We're only calling this to import all factories provided via entry
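The rewritten `max_length` docs recommend segmenting long texts rather than raising the limit. A minimal sketch of that advice (assumed, not from the commit; `spacy.blank("en")` stands in for any pipeline):

# Split an over-long text into paragraphs and process them as a stream,
# instead of raising nlp.max_length. "Paragraph = blank-line-separated
# block" is an assumption here; pick whatever unit fits your data.
import spacy

nlp = spacy.blank("en")

def pipe_long_text(text, nlp):
    paragraphs = [p for p in text.split("\n\n") if p.strip()]
    # nlp.pipe streams Doc objects and keeps per-text memory bounded.
    yield from nlp.pipe(paragraphs)

docs = list(pipe_long_text("First paragraph.\n\nSecond paragraph.", nlp))
assert len(docs) == 2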
@@ -150,12 +153,12 @@ class Language:
             if not create_lemmatizer:
                 lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
                 create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-            # TODO: where does the vocab data come in?
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
                 lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
+                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
@@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaNlpWritingSystem(BaseModel):
-    direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'")
-    has_case: StrictBool = Field(..., title="Whether the language has case")
-    has_letters: StrictBool = Field(..., title="Whether the language has letters")
-
-    class Config:
-        extra = "allow"
-
-
 class ConfigSchemaNlp(BaseModel):
     # fmt: off
     lang: StrictStr = Field(..., title="The base language to use")
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
+    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     # fmt: on
 
     class Config:
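`StrictBool` means the setting must be a real boolean, not a truthy string or integer. A self-contained sketch of that behaviour (the `NlpSettings` model is a hypothetical stand-in; only the `load_vocab_data` field is taken from the hunk):

from pydantic import BaseModel, Field, StrictBool, ValidationError

class NlpSettings(BaseModel):  # hypothetical stand-in for ConfigSchemaNlp
    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")

NlpSettings(load_vocab_data=True)   # passes

try:
    NlpSettings(load_vocab_data="yes")  # rejected: StrictBool does no coercion
except ValidationError as err:
    print(err)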
@@ -188,8 +188,10 @@ def load_model(
     """Load a model from a package or data path.
 
     name (str): Package name or model path.
-    **overrides: Specific overrides, like pipeline components to disable.
-    RETURNS (Language): `Language` class with the loaded model.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
     """
     cfg = component_cfg
     if isinstance(name, str):  # name or string path
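A hedged usage sketch matching the updated docstring: both keyword arguments are taken from the documentation above (the signature at this commit may differ from released spaCy versions), and the model name is a placeholder for any installed package or data path.

from spacy.util import load_model

nlp = load_model(
    "en_core_web_sm",              # package name or model path (placeholder)
    disable=["ner"],               # pipeline components to disable
    component_cfg={"parser": {}},  # per-component config overrides
)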
@@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_lookups_data:
+    if load_data:
         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
         lookups = load_lookups(lang, tables=tables, strict=False)
     else:
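The keyword rename (`load_lookups_data` to `load_data`) matches the `load_data=` call site added in `Language.__init__` above. A hedged sketch of calling the function directly; the signature is taken from the hunk, but the import path is an assumption (in released spaCy v3, `create_vocab` lives in `spacy.vocab`):

from spacy.lang.en import English
from spacy.vocab import create_vocab  # import path assumed, see note above

# Skip the spacy-lookups-data tables entirely:
vocab = create_vocab("en", English.Defaults, load_data=False)

# Or pre-populate lexeme_norm, lexeme_prob, etc. when the package is installed:
vocab = create_vocab("en", English.Defaults, load_data=True)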