Mirror of https://github.com/explosion/spaCy.git
Re-add setting for vocab data and tidy up

This commit is contained in:
parent b9aaa4e457
commit 8d9d28eb8b
@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true

 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
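For context, the new flag is a plain value in the [nlp] block, so it can be overridden from any user config. A minimal sketch, assuming the thinc Config loader that spaCy v3 uses for these files:

    # Sketch (not part of this commit): flip the new setting in a config
    # string and parse it with thinc's Config loader.
    from thinc.api import Config

    config_str = """
    [nlp]
    lang = "en"
    pipeline = []
    load_vocab_data = false

    [nlp.tokenizer]
    @tokenizers = "spacy.Tokenizer.v1"
    """

    config = Config().from_str(config_str)
    assert config["nlp"]["load_vocab_data"] is False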
@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out memory on extremely long texts, due to
+            large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc, before passing
+            them to spaCy. Default maximum length is 1,000,000 characters (1mb). As
+            a rule of thumb, if all pipeline components are enabled, spaCy's
+            default models currently requires roughly 1GB of temporary memory per
             100,000 characters in one text.
+        create_tokenizer (Callable): Function that takes the nlp object and
+            returns a tokenizer.
+        create_lemmatizer (Callable): Function that takes the nlp object and
+            returns a lemmatizer.
         RETURNS (Language): The newly constructed object.
         """
         # We're only calling this to import all factories provided via entry
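The docstring's advice is to segment very long texts rather than raise the limit. A minimal sketch of that workaround; the blank-line paragraph split is an illustrative assumption, not spaCy API:

    # Illustrative workaround for texts longer than nlp.max_length: process
    # paragraph-sized chunks instead of one huge string. Use whatever
    # splitting unit fits your data.
    def docs_for_long_text(nlp, text):
        paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
        return list(nlp.pipe(paragraphs))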
@@ -150,12 +153,12 @@ class Language:
             if not create_lemmatizer:
                 lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
                 create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-            # TODO: where does the vocab data come in?
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
                 lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
+                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
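Illustratively, this wiring means the config value simply flows into create_vocab as a keyword argument, so the same call can be made directly. The import path below is an assumption based on this diff's context files, not something the commit confirms:

    # Hypothetical direct call mirroring the wiring above; the module path
    # for create_vocab is assumed, and the signature is taken from this diff.
    from spacy.vocab import create_vocab
    from spacy.lang.en import English

    cfg = {"nlp": {"load_vocab_data": False}}
    vocab = create_vocab("en", English.Defaults, load_data=cfg["nlp"]["load_vocab_data"])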
@@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel):
         arbitrary_types_allowed = True


-class ConfigSchemaNlpWritingSystem(BaseModel):
-    direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'")
-    has_case: StrictBool = Field(..., title="Whether the language has case")
-    has_letters: StrictBool = Field(..., title="Whether the language has letters")
-
-    class Config:
-        extra = "allow"
-
-
 class ConfigSchemaNlp(BaseModel):
     # fmt: off
     lang: StrictStr = Field(..., title="The base language to use")
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
+    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     # fmt: on

     class Config:
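A quick way to see what StrictBool buys here, assuming ConfigSchemaNlp is importable from spacy.schemas and setting aside any required fields not visible in this hunk:

    # Pydantic's StrictBool rejects truthy non-booleans, so a string like
    # "yes" in the config fails validation instead of being coerced.
    from pydantic import ValidationError
    from spacy.schemas import ConfigSchemaNlp  # import path assumed

    try:
        ConfigSchemaNlp(
            lang="en",
            pipeline=[],
            tokenizer=lambda nlp: None,
            lemmatizer=lambda nlp: None,
            load_vocab_data="yes",  # a string, not a strict bool
        )
    except ValidationError as err:
        print(err)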
@@ -188,8 +188,10 @@ def load_model(
     """Load a model from a package or data path.

     name (str): Package name or model path.
-    **overrides: Specific overrides, like pipeline components to disable.
-    RETURNS (Language): `Language` class with the loaded model.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
     """
     cfg = component_cfg
     if isinstance(name, str):  # name or string path
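A hedged usage sketch matching the corrected docstring; the model name is a placeholder, and the keyword arguments are the two parameters documented above:

    from spacy.util import load_model

    nlp = load_model(
        "en_core_web_sm",            # placeholder package name or path
        disable=["parser"],          # components to disable
        component_cfg={"ner": {}},   # per-component config overrides
    )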
@@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_lookups_data:
+    if load_data:
         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
         lookups = load_lookups(lang, tables=tables, strict=False)
     else:
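For reference, the load_lookups call in this body degrades gracefully: with strict=False, tables that spacy-lookups-data does not provide are simply skipped rather than raising. A small sketch of that behaviour:

    # With strict=False, only the tables actually found for the language
    # are loaded; without spacy-lookups-data installed, the result is empty.
    from spacy.lookups import load_lookups

    tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
    lookups = load_lookups("en", tables=tables, strict=False)
    print(lookups.tables)  # names of the tables that were found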