diff --git a/spacy/language.py b/spacy/language.py
index 8ef2f1d61..8d546529d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1161,7 +1163,6 @@ class Language:
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
@@ -1198,28 +1199,38 @@ class Language:
             if not valid_examples:
                 err = Errors.E930.format(name="Language", obj="empty list")
                 raise ValueError(err)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        V = I["vocab"]
+        init_vocab(
+            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, V)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        self._optimizer = sgd
         if hasattr(self.tokenizer, "initialize"):
-            tok_settings = settings.get("tokenizer", {})
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,
-                tok_settings,
+                I["tokenizer"],
                 section="tokenizer",
                 name="tokenizer",
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
-        proc_settings = settings.get("components", {})
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
-                p_settings = proc_settings.get(name, {})
+                p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
+        self._optimizer = sgd
         if sgd is not None:
             self._optimizer = sgd
         elif self._optimizer is None:
diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 1d2e7e5a3..b6c22ee09 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -37,30 +37,33 @@ def test_initialize_arguments():
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Empty settings, no required custom1 argument
-        settings = {"tokenizer": {"custom": 1}, "components": {name: {}}}
-        nlp.initialize(get_examples, settings=settings)
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Wrong type
-        settings = {
-            "tokenizer": {"custom": 1},
-            "components": {name: {"custom1": "x", "custom2": 1}},
-        }
-        nlp.initialize(get_examples, settings=settings)
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
-    settings = {
+    init_cfg = {
         "tokenizer": {"custom": 1},
         "components": {name: {"custom1": "x", "custom2": True}},
     }
-    nlp.initialize(get_examples, settings=settings)
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
     assert nlp.tokenizer.from_initialize == 1
     pipe = nlp.get_pipe(name)
     assert pipe.from_initialize == ("x", True)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index b42732d48..9517c6c48 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,4 +1,4 @@
-from typing import Union, Dict, Optional, Any, List, IO
+from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
@@ -11,16 +11,18 @@ import zipfile
 import tqdm
 
 from .loop import create_before_to_disk_callback
-from ..language import Language
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
+
+def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
     msg = Printer(no_print=silent)
     raw_config = config
     config = raw_config.interpolate()
@@ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(
-        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
-    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -55,16 +52,11 @@
         msg.info(f"Resuming training for: {resume_components}")
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, V)
-        if loaded and P["component"]:
-            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
 
@@ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
 
 
 def init_vocab(
-    nlp: Language,
+    nlp: "Language",
     *,
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
     silent: bool = True,
-) -> Language:
+) -> "Language":
     msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
@@ -109,7 +101,7 @@ def init_vocab(
 
 
 def load_vectors_into_model(
-    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
@@ -132,8 +124,8 @@
             nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+def init_tok2vec(
+    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
@@ -171,7 +163,7 @@
     return False
 
 
-def verify_config(nlp: Language) -> None:
+def verify_config(nlp: "Language") -> None:
     """Perform additional checks based on the config, loaded nlp object and training data."""
     # TODO: maybe we should validate based on the actual components, the list
     # in config["nlp"]["pipeline"] instead?
@@ -182,7 +174,7 @@ def verify_config(nlp: Language) -> None:
             verify_textcat_config(nlp, pipe_config)
 
 
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
     if pipe_config.get("positive_label"):
@@ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
 
 
 def convert_vectors(
-    nlp: Language,
+    nlp: "Language",
     vectors_loc: Optional[Path],
     *,
     truncate: int,
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 5153be66c..41e6464e0 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -1,5 +1,5 @@
 from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
@@ -9,13 +9,15 @@ from wasabi import Printer
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
-from ..language import Language
 from ..errors import Errors
 from ..util import resolve_dot_names, registry
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
 
 def train(
-    nlp: Language,
+    nlp: "Language",
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
@@ -110,7 +112,7 @@ def train(
 
 
 def train_while_improving(
-    nlp: Language,
+    nlp: "Language",
     optimizer: Optimizer,
     train_data,
     evaluate,
@@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):
 
 
 def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
     weights = {key: value for key, value in weights.items() if value is not None}
 
@@ -277,7 +279,7 @@ def create_train_batches(
 
 
 def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:
@@ -288,8 +290,10 @@
 
 
 def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
+    callback: Optional[Callable[["Language"], "Language"]]
+) -> Callable[["Language"], "Language"]:
+    from ..language import Language  # noqa: F811
+
     def before_to_disk(nlp: Language) -> Language:
         if not callback:
             return nlp
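
A minimal sketch of the initialization flow this patch introduces, assuming it is applied (the blank pipeline and the empty settings dicts below are illustrative, not taken from the diff): `Language.initialize` no longer accepts a `settings` argument, but instead interpolates `nlp.config`, resolves its `[initialize]` block against `ConfigSchemaInit`, runs `init_vocab`/`init_tok2vec` itself, and validates the tokenizer and per-component settings found there.

```python
import spacy

nlp = spacy.blank("en")
# Settings formerly passed as nlp.initialize(settings=...) now live in the
# config's [initialize] block, which initialize() resolves and validates
# section by section (vocab, tokenizer, components):
nlp.config["initialize"]["tokenizer"] = {}   # kwargs for a custom tokenizer's initialize
nlp.config["initialize"]["components"] = {}  # init kwargs keyed by component name
optimizer = nlp.initialize()                 # reads [initialize] via ConfigSchemaInit
```

Because `initialize` now handles config interpolation, vocab setup, and pretrained tok2vec weights itself, callers such as `init_nlp` only pass `get_examples` and `sgd`, and the `[initialize]`/`[pretraining]` logic lives in one place.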