mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Simplify config use in Language.initialize
This commit is contained in:
		
							parent
							
								
									56f8bc73ef
								
							
						
					
					
						commit
						63d1598137
					
				|  | @ -18,6 +18,7 @@ from .tokens.underscore import Underscore | ||||||
| from .vocab import Vocab, create_vocab | from .vocab import Vocab, create_vocab | ||||||
| from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis | from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis | ||||||
| from .training import Example, validate_examples | from .training import Example, validate_examples | ||||||
|  | from .training.initialize import init_vocab, init_tok2vec | ||||||
| from .scorer import Scorer | from .scorer import Scorer | ||||||
| from .util import registry, SimpleFrozenList | from .util import registry, SimpleFrozenList | ||||||
| from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER | from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER | ||||||
|  | @ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES | ||||||
| from .tokens import Doc | from .tokens import Doc | ||||||
| from .tokenizer import Tokenizer | from .tokenizer import Tokenizer | ||||||
| from .errors import Errors, Warnings | from .errors import Errors, Warnings | ||||||
| from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings | from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit | ||||||
|  | from .schemas import ConfigSchemaPretrain, validate_init_settings | ||||||
| from .git_info import GIT_VERSION | from .git_info import GIT_VERSION | ||||||
| from . import util | from . import util | ||||||
| from . import about | from . import about | ||||||
|  | @ -1161,7 +1163,6 @@ class Language: | ||||||
|         self, |         self, | ||||||
|         get_examples: Optional[Callable[[], Iterable[Example]]] = None, |         get_examples: Optional[Callable[[], Iterable[Example]]] = None, | ||||||
|         *, |         *, | ||||||
|         settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(), |  | ||||||
|         sgd: Optional[Optimizer] = None, |         sgd: Optional[Optimizer] = None, | ||||||
|     ) -> Optimizer: |     ) -> Optimizer: | ||||||
|         """Initialize the pipe for training, using data examples if available. |         """Initialize the pipe for training, using data examples if available. | ||||||
|  | @ -1198,28 +1199,38 @@ class Language: | ||||||
|         if not valid_examples: |         if not valid_examples: | ||||||
|             err = Errors.E930.format(name="Language", obj="empty list") |             err = Errors.E930.format(name="Language", obj="empty list") | ||||||
|             raise ValueError(err) |             raise ValueError(err) | ||||||
|  |         # Make sure the config is interpolated so we can resolve subsections | ||||||
|  |         config = self.config.interpolate() | ||||||
|  |         # These are the settings provided in the [initialize] block in the config | ||||||
|  |         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) | ||||||
|  |         V = I["vocab"] | ||||||
|  |         init_vocab( | ||||||
|  |             self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], | ||||||
|  |         ) | ||||||
|  |         pretrain_cfg = config.get("pretraining") | ||||||
|  |         if pretrain_cfg: | ||||||
|  |             P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) | ||||||
|  |             init_tok2vec(self, P, V) | ||||||
|         if self.vocab.vectors.data.shape[1] >= 1: |         if self.vocab.vectors.data.shape[1] >= 1: | ||||||
|             ops = get_current_ops() |             ops = get_current_ops() | ||||||
|             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) |             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) | ||||||
|         self._optimizer = sgd |  | ||||||
|         if hasattr(self.tokenizer, "initialize"): |         if hasattr(self.tokenizer, "initialize"): | ||||||
|             tok_settings = settings.get("tokenizer", {}) |  | ||||||
|             tok_settings = validate_init_settings( |             tok_settings = validate_init_settings( | ||||||
|                 self.tokenizer.initialize, |                 self.tokenizer.initialize, | ||||||
|                 tok_settings, |                 I["tokenizer"], | ||||||
|                 section="tokenizer", |                 section="tokenizer", | ||||||
|                 name="tokenizer", |                 name="tokenizer", | ||||||
|             ) |             ) | ||||||
|             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) |             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) | ||||||
|         proc_settings = settings.get("components", {}) |  | ||||||
|         for name, proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if hasattr(proc, "initialize"): |             if hasattr(proc, "initialize"): | ||||||
|                 p_settings = proc_settings.get(name, {}) |                 p_settings = I["components"].get(name, {}) | ||||||
|                 p_settings = validate_init_settings( |                 p_settings = validate_init_settings( | ||||||
|                     proc.initialize, p_settings, section="components", name=name |                     proc.initialize, p_settings, section="components", name=name | ||||||
|                 ) |                 ) | ||||||
|                 proc.initialize(get_examples, nlp=self, **p_settings) |                 proc.initialize(get_examples, nlp=self, **p_settings) | ||||||
|         self._link_components() |         self._link_components() | ||||||
|  |         self._optimizer = sgd | ||||||
|         if sgd is not None: |         if sgd is not None: | ||||||
|             self._optimizer = sgd |             self._optimizer = sgd | ||||||
|         elif self._optimizer is None: |         elif self._optimizer is None: | ||||||
|  |  | ||||||
|  | @ -37,30 +37,33 @@ def test_initialize_arguments(): | ||||||
|     get_examples = lambda: [example] |     get_examples = lambda: [example] | ||||||
|     nlp.add_pipe(name) |     nlp.add_pipe(name) | ||||||
|     # The settings here will typically come from the [initialize] block |     # The settings here will typically come from the [initialize] block | ||||||
|  |     init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}} | ||||||
|  |     nlp.config["initialize"].update(init_cfg) | ||||||
|     with pytest.raises(ConfigValidationError) as e: |     with pytest.raises(ConfigValidationError) as e: | ||||||
|         # Empty settings, no required custom1 argument |         # Empty config for component, no required custom1 argument | ||||||
|         settings = {"tokenizer": {"custom": 1}, "components": {name: {}}} |         nlp.initialize(get_examples) | ||||||
|         nlp.initialize(get_examples, settings=settings) |  | ||||||
|     errors = e.value.errors |     errors = e.value.errors | ||||||
|     assert len(errors) == 1 |     assert len(errors) == 1 | ||||||
|     assert errors[0]["loc"] == ("custom1",) |     assert errors[0]["loc"] == ("custom1",) | ||||||
|     assert errors[0]["type"] == "value_error.missing" |     assert errors[0]["type"] == "value_error.missing" | ||||||
|  |     init_cfg = { | ||||||
|  |         "tokenizer": {"custom": 1}, | ||||||
|  |         "components": {name: {"custom1": "x", "custom2": 1}}, | ||||||
|  |     } | ||||||
|  |     nlp.config["initialize"].update(init_cfg) | ||||||
|     with pytest.raises(ConfigValidationError) as e: |     with pytest.raises(ConfigValidationError) as e: | ||||||
|         # Wrong type |         # Wrong type of custom 2 | ||||||
|         settings = { |         nlp.initialize(get_examples) | ||||||
|             "tokenizer": {"custom": 1}, |  | ||||||
|             "components": {name: {"custom1": "x", "custom2": 1}}, |  | ||||||
|         } |  | ||||||
|         nlp.initialize(get_examples, settings=settings) |  | ||||||
|     errors = e.value.errors |     errors = e.value.errors | ||||||
|     assert len(errors) == 1 |     assert len(errors) == 1 | ||||||
|     assert errors[0]["loc"] == ("custom2",) |     assert errors[0]["loc"] == ("custom2",) | ||||||
|     assert errors[0]["type"] == "value_error.strictbool" |     assert errors[0]["type"] == "value_error.strictbool" | ||||||
|     settings = { |     init_cfg = { | ||||||
|         "tokenizer": {"custom": 1}, |         "tokenizer": {"custom": 1}, | ||||||
|         "components": {name: {"custom1": "x", "custom2": True}}, |         "components": {name: {"custom1": "x", "custom2": True}}, | ||||||
|     } |     } | ||||||
|     nlp.initialize(get_examples, settings=settings) |     nlp.config["initialize"].update(init_cfg) | ||||||
|  |     nlp.initialize(get_examples) | ||||||
|     assert nlp.tokenizer.from_initialize == 1 |     assert nlp.tokenizer.from_initialize == 1 | ||||||
|     pipe = nlp.get_pipe(name) |     pipe = nlp.get_pipe(name) | ||||||
|     assert pipe.from_initialize == ("x", True) |     assert pipe.from_initialize == ("x", True) | ||||||
|  |  | ||||||
|  | @ -1,4 +1,4 @@ | ||||||
| from typing import Union, Dict, Optional, Any, List, IO | from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING | ||||||
| from thinc.api import Config, fix_random_seed, set_gpu_allocator | from thinc.api import Config, fix_random_seed, set_gpu_allocator | ||||||
| from thinc.api import ConfigValidationError | from thinc.api import ConfigValidationError | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | @ -11,16 +11,18 @@ import zipfile | ||||||
| import tqdm | import tqdm | ||||||
| 
 | 
 | ||||||
| from .loop import create_before_to_disk_callback | from .loop import create_before_to_disk_callback | ||||||
| from ..language import Language |  | ||||||
| from ..lookups import Lookups | from ..lookups import Lookups | ||||||
| from ..vectors import Vectors | from ..vectors import Vectors | ||||||
| from ..errors import Errors | from ..errors import Errors | ||||||
| from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain | from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain | ||||||
| from ..util import registry, load_model_from_config, resolve_dot_names | from ..util import registry, load_model_from_config, resolve_dot_names | ||||||
| from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB | from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB | ||||||
| 
 | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     from ..language import Language  # noqa: F401 | ||||||
| 
 | 
 | ||||||
| def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language: | 
 | ||||||
|  | def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language": | ||||||
|     msg = Printer(no_print=silent) |     msg = Printer(no_print=silent) | ||||||
|     raw_config = config |     raw_config = config | ||||||
|     config = raw_config.interpolate() |     config = raw_config.interpolate() | ||||||
|  | @ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu | ||||||
|     T = registry.resolve(config["training"], schema=ConfigSchemaTraining) |     T = registry.resolve(config["training"], schema=ConfigSchemaTraining) | ||||||
|     dot_names = [T["train_corpus"], T["dev_corpus"]] |     dot_names = [T["train_corpus"], T["dev_corpus"]] | ||||||
|     train_corpus, dev_corpus = resolve_dot_names(config, dot_names) |     train_corpus, dev_corpus = resolve_dot_names(config, dot_names) | ||||||
|     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) |  | ||||||
|     V = I["vocab"] |  | ||||||
|     init_vocab( |  | ||||||
|         nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent |  | ||||||
|     ) |  | ||||||
|     optimizer = T["optimizer"] |     optimizer = T["optimizer"] | ||||||
|     before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) |     before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) | ||||||
|     # Components that shouldn't be updated during training |     # Components that shouldn't be updated during training | ||||||
|  | @ -55,16 +52,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu | ||||||
|             msg.info(f"Resuming training for: {resume_components}") |             msg.info(f"Resuming training for: {resume_components}") | ||||||
|             nlp.resume_training(sgd=optimizer) |             nlp.resume_training(sgd=optimizer) | ||||||
|     with nlp.select_pipes(disable=[*frozen_components, *resume_components]): |     with nlp.select_pipes(disable=[*frozen_components, *resume_components]): | ||||||
|         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I) |         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) | ||||||
|         msg.good("Initialized pipeline components") |         msg.good("Initialized pipeline components") | ||||||
|     # Verify the config after calling 'initialize' to ensure labels |     # Verify the config after calling 'initialize' to ensure labels | ||||||
|     # are properly initialized |     # are properly initialized | ||||||
|     verify_config(nlp) |     verify_config(nlp) | ||||||
|     if "pretraining" in config and config["pretraining"]: |  | ||||||
|         P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) |  | ||||||
|         loaded = add_tok2vec_weights(nlp, P, V) |  | ||||||
|         if loaded and P["component"]: |  | ||||||
|             msg.good(f"Loaded pretrained weights into component '{P['component']}'") |  | ||||||
|     nlp = before_to_disk(nlp) |     nlp = before_to_disk(nlp) | ||||||
|     return nlp |     return nlp | ||||||
| 
 | 
 | ||||||
|  | @ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def init_vocab( | def init_vocab( | ||||||
|     nlp: Language, |     nlp: "Language", | ||||||
|     *, |     *, | ||||||
|     data: Optional[Path] = None, |     data: Optional[Path] = None, | ||||||
|     lookups: Optional[Lookups] = None, |     lookups: Optional[Lookups] = None, | ||||||
|     vectors: Optional[str] = None, |     vectors: Optional[str] = None, | ||||||
|     silent: bool = True, |     silent: bool = True, | ||||||
| ) -> Language: | ) -> "Language": | ||||||
|     msg = Printer(no_print=silent) |     msg = Printer(no_print=silent) | ||||||
|     if lookups: |     if lookups: | ||||||
|         nlp.vocab.lookups = lookups |         nlp.vocab.lookups = lookups | ||||||
|  | @ -109,7 +101,7 @@ def init_vocab( | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load_vectors_into_model( | def load_vectors_into_model( | ||||||
|     nlp: Language, name: Union[str, Path], *, add_strings: bool = True |     nlp: "Language", name: Union[str, Path], *, add_strings: bool = True | ||||||
| ) -> None: | ) -> None: | ||||||
|     """Load word vectors from an installed model or path into a model instance.""" |     """Load word vectors from an installed model or path into a model instance.""" | ||||||
|     try: |     try: | ||||||
|  | @ -132,8 +124,8 @@ def load_vectors_into_model( | ||||||
|                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) |                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def add_tok2vec_weights( | def init_tok2vec( | ||||||
|     nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] |     nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] | ||||||
| ) -> bool: | ) -> bool: | ||||||
|     # Load pretrained tok2vec weights - cf. CLI command 'pretrain' |     # Load pretrained tok2vec weights - cf. CLI command 'pretrain' | ||||||
|     P = pretrain_config |     P = pretrain_config | ||||||
|  | @ -171,7 +163,7 @@ def add_tok2vec_weights( | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def verify_config(nlp: Language) -> None: | def verify_config(nlp: "Language") -> None: | ||||||
|     """Perform additional checks based on the config, loaded nlp object and training data.""" |     """Perform additional checks based on the config, loaded nlp object and training data.""" | ||||||
|     # TODO: maybe we should validate based on the actual components, the list |     # TODO: maybe we should validate based on the actual components, the list | ||||||
|     # in config["nlp"]["pipeline"] instead? |     # in config["nlp"]["pipeline"] instead? | ||||||
|  | @ -182,7 +174,7 @@ def verify_config(nlp: Language) -> None: | ||||||
|             verify_textcat_config(nlp, pipe_config) |             verify_textcat_config(nlp, pipe_config) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: | def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None: | ||||||
|     # if 'positive_label' is provided: double check whether it's in the data and |     # if 'positive_label' is provided: double check whether it's in the data and | ||||||
|     # the task is binary |     # the task is binary | ||||||
|     if pipe_config.get("positive_label"): |     if pipe_config.get("positive_label"): | ||||||
|  | @ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def convert_vectors( | def convert_vectors( | ||||||
|     nlp: Language, |     nlp: "Language", | ||||||
|     vectors_loc: Optional[Path], |     vectors_loc: Optional[Path], | ||||||
|     *, |     *, | ||||||
|     truncate: int, |     truncate: int, | ||||||
|  |  | ||||||
|  | @ -1,5 +1,5 @@ | ||||||
| from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any | from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any | ||||||
| from typing import Optional | from typing import Optional, TYPE_CHECKING | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||||
| from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator | from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator | ||||||
|  | @ -9,13 +9,15 @@ from wasabi import Printer | ||||||
| 
 | 
 | ||||||
| from .example import Example | from .example import Example | ||||||
| from ..schemas import ConfigSchemaTraining | from ..schemas import ConfigSchemaTraining | ||||||
| from ..language import Language |  | ||||||
| from ..errors import Errors | from ..errors import Errors | ||||||
| from ..util import resolve_dot_names, registry | from ..util import resolve_dot_names, registry | ||||||
| 
 | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     from ..language import Language  # noqa: F401 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def train( | def train( | ||||||
|     nlp: Language, |     nlp: "Language", | ||||||
|     output_path: Optional[Path] = None, |     output_path: Optional[Path] = None, | ||||||
|     *, |     *, | ||||||
|     use_gpu: int = -1, |     use_gpu: int = -1, | ||||||
|  | @ -110,7 +112,7 @@ def train( | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def train_while_improving( | def train_while_improving( | ||||||
|     nlp: Language, |     nlp: "Language", | ||||||
|     optimizer: Optimizer, |     optimizer: Optimizer, | ||||||
|     train_data, |     train_data, | ||||||
|     evaluate, |     evaluate, | ||||||
|  | @ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def create_evaluation_callback( | def create_evaluation_callback( | ||||||
|     nlp: Language, dev_corpus: Callable, weights: Dict[str, float] |     nlp: "Language", dev_corpus: Callable, weights: Dict[str, float] | ||||||
| ) -> Callable[[], Tuple[float, Dict[str, float]]]: | ) -> Callable[[], Tuple[float, Dict[str, float]]]: | ||||||
|     weights = {key: value for key, value in weights.items() if value is not None} |     weights = {key: value for key, value in weights.items() if value is not None} | ||||||
| 
 | 
 | ||||||
|  | @ -277,7 +279,7 @@ def create_train_batches( | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def update_meta( | def update_meta( | ||||||
|     training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] |     training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any] | ||||||
| ) -> None: | ) -> None: | ||||||
|     nlp.meta["performance"] = {} |     nlp.meta["performance"] = {} | ||||||
|     for metric in training["score_weights"]: |     for metric in training["score_weights"]: | ||||||
|  | @ -288,8 +290,10 @@ def update_meta( | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def create_before_to_disk_callback( | def create_before_to_disk_callback( | ||||||
|     callback: Optional[Callable[[Language], Language]] |     callback: Optional[Callable[["Language"], "Language"]] | ||||||
| ) -> Callable[[Language], Language]: | ) -> Callable[["Language"], "Language"]: | ||||||
|  |     from ..language import Language  # noqa: F811 | ||||||
|  | 
 | ||||||
|     def before_to_disk(nlp: Language) -> Language: |     def before_to_disk(nlp: Language) -> Language: | ||||||
|         if not callback: |         if not callback: | ||||||
|             return nlp |             return nlp | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user