Re-add setting for vocab data and tidy up

This commit is contained in:
parent b9aaa4e457
commit 8d9d28eb8b

@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true
 
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
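
The new `load_vocab_data` key lives in the `[nlp]` block of the default config, so a user config can flip it like any other top-level nlp setting. A minimal sketch of such an override, reusing only the keys shown in the hunk above:

    [nlp]
    lang = null
    pipeline = []
    load_vocab_data = false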

@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
-            100,000 characters in one text.
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out of memory on extremely long texts, due
+            to large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc., before passing
+            them to spaCy. The default maximum length is 1,000,000 characters
+            (1 MB). As a rule of thumb, if all pipeline components are enabled,
+            spaCy's default models currently require roughly 1 GB of temporary
+            memory per 100,000 characters in one text.
+        create_tokenizer (Callable): Function that takes the nlp object and
+            returns a tokenizer.
+        create_lemmatizer (Callable): Function that takes the nlp object and
+            returns a lemmatizer.
         RETURNS (Language): The newly constructed object.
         """
         # We're only calling this to import all factories provided via entry
@@ -150,12 +153,12 @@ class Language:
             if not create_lemmatizer:
                 lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
                 create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-            # TODO: where does the vocab data come in?
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
                 lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
+                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
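
With this change the setting flows config -> `Language.__init__` -> `create_vocab(load_data=...)`. A hedged sketch of exercising that path, assuming the in-development `Language` constructor accepts a `config` override dict (the exact constructor signature is not shown in this diff):

    from spacy.lang.en import English

    # Assumption: the constructor merges this override into self._config
    # before the vocab is created, so create_vocab() ends up being called
    # with load_data=False.
    nlp = English(config={"nlp": {"load_vocab_data": False}})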

@@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaNlpWritingSystem(BaseModel):
-    direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'")
-    has_case: StrictBool = Field(..., title="Whether the language has case")
-    has_letters: StrictBool = Field(..., title="Whether the language has letters")
-
-    class Config:
-        extra = "allow"
-
-
 class ConfigSchemaNlp(BaseModel):
     # fmt: off
     lang: StrictStr = Field(..., title="The base language to use")
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
+    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
    # fmt: on
 
     class Config:
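
Because `load_vocab_data` is declared as a `StrictBool`, pydantic rejects truthy stand-ins instead of coercing them. A self-contained sketch of that behavior (the `NlpSchemaSketch` class is illustrative, not part of the codebase):

    from pydantic import BaseModel, Field, StrictBool, ValidationError

    class NlpSchemaSketch(BaseModel):
        load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data")

    NlpSchemaSketch(load_vocab_data=True)  # validates fine
    try:
        NlpSchemaSketch(load_vocab_data="true")  # strings are not coerced to bool
    except ValidationError as err:
        print(err)  # reports that a strict boolean was expected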

@@ -188,8 +188,10 @@ def load_model(
     """Load a model from a package or data path.
 
     name (str): Package name or model path.
-    **overrides: Specific overrides, like pipeline components to disable.
-    RETURNS (Language): `Language` class with the loaded model.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
     """
     cfg = component_cfg
     if isinstance(name, str):  # name or string path
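
The updated docstring now documents the actual keyword arguments. A hedged usage sketch based only on that docstring; the package name and the tagger setting are placeholders, not guaranteed options:

    # Load a pipeline, skip NER, and pass per-component config overrides.
    nlp = load_model(
        "en_core_web_sm",  # example package name
        disable=["ner"],
        component_cfg={"tagger": {"set_morphology": False}},  # hypothetical override
    )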

@@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_lookups_data:
+    if load_data:
         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
         lookups = load_lookups(lang, tables=tables, strict=False)
     else:
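
With the rename, callers pass `load_data` instead of `load_lookups_data`. A hedged sketch of building a vocab without the optional tables, assuming `create_vocab` is imported from the module patched above and that `English.Defaults` supplies the expected language defaults:

    from spacy.lang.en import English

    vocab = create_vocab("en", English.Defaults, load_data=False)
    # Without spacy-lookups-data, the lexeme_* tables should not be present.
    print(vocab.lookups.tables)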