mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Simplify config use in Language.initialize
This commit is contained in:
		
							parent
							
								
									56f8bc73ef
								
							
						
					
					
						commit
						63d1598137
					
				| 
						 | 
				
			
			@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
 | 
			
		|||
from .vocab import Vocab, create_vocab
 | 
			
		||||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 | 
			
		||||
from .training import Example, validate_examples
 | 
			
		||||
from .training.initialize import init_vocab, init_tok2vec
 | 
			
		||||
from .scorer import Scorer
 | 
			
		||||
from .util import registry, SimpleFrozenList
 | 
			
		||||
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 | 
			
		||||
| 
						 | 
				
			
			@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 | 
			
		|||
from .tokens import Doc
 | 
			
		||||
from .tokenizer import Tokenizer
 | 
			
		||||
from .errors import Errors, Warnings
 | 
			
		||||
from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
 | 
			
		||||
from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
 | 
			
		||||
from .schemas import ConfigSchemaPretrain, validate_init_settings
 | 
			
		||||
from .git_info import GIT_VERSION
 | 
			
		||||
from . import util
 | 
			
		||||
from . import about
 | 
			
		||||
| 
						 | 
				
			
			@ -1161,7 +1163,6 @@ class Language:
 | 
			
		|||
        self,
 | 
			
		||||
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
 | 
			
		||||
        *,
 | 
			
		||||
        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
 | 
			
		||||
        sgd: Optional[Optimizer] = None,
 | 
			
		||||
    ) -> Optimizer:
 | 
			
		||||
        """Initialize the pipe for training, using data examples if available.
 | 
			
		||||
| 
						 | 
				
			
			@ -1198,28 +1199,38 @@ class Language:
 | 
			
		|||
        if not valid_examples:
 | 
			
		||||
            err = Errors.E930.format(name="Language", obj="empty list")
 | 
			
		||||
            raise ValueError(err)
 | 
			
		||||
        # Make sure the config is interpolated so we can resolve subsections
 | 
			
		||||
        config = self.config.interpolate()
 | 
			
		||||
        # These are the settings provided in the [initialize] block in the config
 | 
			
		||||
        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
 | 
			
		||||
        V = I["vocab"]
 | 
			
		||||
        init_vocab(
 | 
			
		||||
            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
 | 
			
		||||
        )
 | 
			
		||||
        pretrain_cfg = config.get("pretraining")
 | 
			
		||||
        if pretrain_cfg:
 | 
			
		||||
            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
 | 
			
		||||
            init_tok2vec(self, P, V)
 | 
			
		||||
        if self.vocab.vectors.data.shape[1] >= 1:
 | 
			
		||||
            ops = get_current_ops()
 | 
			
		||||
            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
 | 
			
		||||
        self._optimizer = sgd
 | 
			
		||||
        if hasattr(self.tokenizer, "initialize"):
 | 
			
		||||
            tok_settings = settings.get("tokenizer", {})
 | 
			
		||||
            tok_settings = validate_init_settings(
 | 
			
		||||
                self.tokenizer.initialize,
 | 
			
		||||
                tok_settings,
 | 
			
		||||
                I["tokenizer"],
 | 
			
		||||
                section="tokenizer",
 | 
			
		||||
                name="tokenizer",
 | 
			
		||||
            )
 | 
			
		||||
            self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
 | 
			
		||||
        proc_settings = settings.get("components", {})
 | 
			
		||||
        for name, proc in self.pipeline:
 | 
			
		||||
            if hasattr(proc, "initialize"):
 | 
			
		||||
                p_settings = proc_settings.get(name, {})
 | 
			
		||||
                p_settings = I["components"].get(name, {})
 | 
			
		||||
                p_settings = validate_init_settings(
 | 
			
		||||
                    proc.initialize, p_settings, section="components", name=name
 | 
			
		||||
                )
 | 
			
		||||
                proc.initialize(get_examples, nlp=self, **p_settings)
 | 
			
		||||
        self._link_components()
 | 
			
		||||
        self._optimizer = sgd
 | 
			
		||||
        if sgd is not None:
 | 
			
		||||
            self._optimizer = sgd
 | 
			
		||||
        elif self._optimizer is None:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -37,30 +37,33 @@ def test_initialize_arguments():
 | 
			
		|||
    get_examples = lambda: [example]
 | 
			
		||||
    nlp.add_pipe(name)
 | 
			
		||||
    # The settings here will typically come from the [initialize] block
 | 
			
		||||
    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
 | 
			
		||||
    nlp.config["initialize"].update(init_cfg)
 | 
			
		||||
    with pytest.raises(ConfigValidationError) as e:
 | 
			
		||||
        # Empty settings, no required custom1 argument
 | 
			
		||||
        settings = {"tokenizer": {"custom": 1}, "components": {name: {}}}
 | 
			
		||||
        nlp.initialize(get_examples, settings=settings)
 | 
			
		||||
        # Empty config for component, no required custom1 argument
 | 
			
		||||
        nlp.initialize(get_examples)
 | 
			
		||||
    errors = e.value.errors
 | 
			
		||||
    assert len(errors) == 1
 | 
			
		||||
    assert errors[0]["loc"] == ("custom1",)
 | 
			
		||||
    assert errors[0]["type"] == "value_error.missing"
 | 
			
		||||
    with pytest.raises(ConfigValidationError) as e:
 | 
			
		||||
        # Wrong type
 | 
			
		||||
        settings = {
 | 
			
		||||
    init_cfg = {
 | 
			
		||||
        "tokenizer": {"custom": 1},
 | 
			
		||||
        "components": {name: {"custom1": "x", "custom2": 1}},
 | 
			
		||||
    }
 | 
			
		||||
        nlp.initialize(get_examples, settings=settings)
 | 
			
		||||
    nlp.config["initialize"].update(init_cfg)
 | 
			
		||||
    with pytest.raises(ConfigValidationError) as e:
 | 
			
		||||
        # Wrong type of custom 2
 | 
			
		||||
        nlp.initialize(get_examples)
 | 
			
		||||
    errors = e.value.errors
 | 
			
		||||
    assert len(errors) == 1
 | 
			
		||||
    assert errors[0]["loc"] == ("custom2",)
 | 
			
		||||
    assert errors[0]["type"] == "value_error.strictbool"
 | 
			
		||||
    settings = {
 | 
			
		||||
    init_cfg = {
 | 
			
		||||
        "tokenizer": {"custom": 1},
 | 
			
		||||
        "components": {name: {"custom1": "x", "custom2": True}},
 | 
			
		||||
    }
 | 
			
		||||
    nlp.initialize(get_examples, settings=settings)
 | 
			
		||||
    nlp.config["initialize"].update(init_cfg)
 | 
			
		||||
    nlp.initialize(get_examples)
 | 
			
		||||
    assert nlp.tokenizer.from_initialize == 1
 | 
			
		||||
    pipe = nlp.get_pipe(name)
 | 
			
		||||
    assert pipe.from_initialize == ("x", True)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,4 @@
 | 
			
		|||
from typing import Union, Dict, Optional, Any, List, IO
 | 
			
		||||
from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 | 
			
		||||
from thinc.api import Config, fix_random_seed, set_gpu_allocator
 | 
			
		||||
from thinc.api import ConfigValidationError
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
| 
						 | 
				
			
			@ -11,16 +11,18 @@ import zipfile
 | 
			
		|||
import tqdm
 | 
			
		||||
 | 
			
		||||
from .loop import create_before_to_disk_callback
 | 
			
		||||
from ..language import Language
 | 
			
		||||
from ..lookups import Lookups
 | 
			
		||||
from ..vectors import Vectors
 | 
			
		||||
from ..errors import Errors
 | 
			
		||||
from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
 | 
			
		||||
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 | 
			
		||||
from ..util import registry, load_model_from_config, resolve_dot_names
 | 
			
		||||
from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
    from ..language import Language  # noqa: F401
 | 
			
		||||
 | 
			
		||||
def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
 | 
			
		||||
 | 
			
		||||
def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
 | 
			
		||||
    msg = Printer(no_print=silent)
 | 
			
		||||
    raw_config = config
 | 
			
		||||
    config = raw_config.interpolate()
 | 
			
		||||
| 
						 | 
				
			
			@ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
 | 
			
		|||
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
 | 
			
		||||
    dot_names = [T["train_corpus"], T["dev_corpus"]]
 | 
			
		||||
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
 | 
			
		||||
    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
 | 
			
		||||
    V = I["vocab"]
 | 
			
		||||
    init_vocab(
 | 
			
		||||
        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
 | 
			
		||||
    )
 | 
			
		||||
    optimizer = T["optimizer"]
 | 
			
		||||
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
 | 
			
		||||
    # Components that shouldn't be updated during training
 | 
			
		||||
| 
						 | 
				
			
			@ -55,16 +52,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
 | 
			
		|||
            msg.info(f"Resuming training for: {resume_components}")
 | 
			
		||||
            nlp.resume_training(sgd=optimizer)
 | 
			
		||||
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
 | 
			
		||||
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
 | 
			
		||||
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
 | 
			
		||||
        msg.good("Initialized pipeline components")
 | 
			
		||||
    # Verify the config after calling 'initialize' to ensure labels
 | 
			
		||||
    # are properly initialized
 | 
			
		||||
    verify_config(nlp)
 | 
			
		||||
    if "pretraining" in config and config["pretraining"]:
 | 
			
		||||
        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
 | 
			
		||||
        loaded = add_tok2vec_weights(nlp, P, V)
 | 
			
		||||
        if loaded and P["component"]:
 | 
			
		||||
            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
 | 
			
		||||
    nlp = before_to_disk(nlp)
 | 
			
		||||
    return nlp
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def init_vocab(
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    nlp: "Language",
 | 
			
		||||
    *,
 | 
			
		||||
    data: Optional[Path] = None,
 | 
			
		||||
    lookups: Optional[Lookups] = None,
 | 
			
		||||
    vectors: Optional[str] = None,
 | 
			
		||||
    silent: bool = True,
 | 
			
		||||
) -> Language:
 | 
			
		||||
) -> "Language":
 | 
			
		||||
    msg = Printer(no_print=silent)
 | 
			
		||||
    if lookups:
 | 
			
		||||
        nlp.vocab.lookups = lookups
 | 
			
		||||
| 
						 | 
				
			
			@ -109,7 +101,7 @@ def init_vocab(
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def load_vectors_into_model(
 | 
			
		||||
    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
 | 
			
		||||
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
 | 
			
		||||
) -> None:
 | 
			
		||||
    """Load word vectors from an installed model or path into a model instance."""
 | 
			
		||||
    try:
 | 
			
		||||
| 
						 | 
				
			
			@ -132,8 +124,8 @@ def load_vectors_into_model(
 | 
			
		|||
                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def add_tok2vec_weights(
 | 
			
		||||
    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 | 
			
		||||
def init_tok2vec(
 | 
			
		||||
    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 | 
			
		||||
) -> bool:
 | 
			
		||||
    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
 | 
			
		||||
    P = pretrain_config
 | 
			
		||||
| 
						 | 
				
			
			@ -171,7 +163,7 @@ def add_tok2vec_weights(
 | 
			
		|||
    return False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def verify_config(nlp: Language) -> None:
 | 
			
		||||
def verify_config(nlp: "Language") -> None:
 | 
			
		||||
    """Perform additional checks based on the config, loaded nlp object and training data."""
 | 
			
		||||
    # TODO: maybe we should validate based on the actual components, the list
 | 
			
		||||
    # in config["nlp"]["pipeline"] instead?
 | 
			
		||||
| 
						 | 
				
			
			@ -182,7 +174,7 @@ def verify_config(nlp: Language) -> None:
 | 
			
		|||
            verify_textcat_config(nlp, pipe_config)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
 | 
			
		||||
def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
 | 
			
		||||
    # if 'positive_label' is provided: double check whether it's in the data and
 | 
			
		||||
    # the task is binary
 | 
			
		||||
    if pipe_config.get("positive_label"):
 | 
			
		||||
| 
						 | 
				
			
			@ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def convert_vectors(
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    nlp: "Language",
 | 
			
		||||
    vectors_loc: Optional[Path],
 | 
			
		||||
    *,
 | 
			
		||||
    truncate: int,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,5 @@
 | 
			
		|||
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
 | 
			
		||||
from typing import Optional
 | 
			
		||||
from typing import Optional, TYPE_CHECKING
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from timeit import default_timer as timer
 | 
			
		||||
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 | 
			
		||||
| 
						 | 
				
			
			@ -9,13 +9,15 @@ from wasabi import Printer
 | 
			
		|||
 | 
			
		||||
from .example import Example
 | 
			
		||||
from ..schemas import ConfigSchemaTraining
 | 
			
		||||
from ..language import Language
 | 
			
		||||
from ..errors import Errors
 | 
			
		||||
from ..util import resolve_dot_names, registry
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
    from ..language import Language  # noqa: F401
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def train(
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    nlp: "Language",
 | 
			
		||||
    output_path: Optional[Path] = None,
 | 
			
		||||
    *,
 | 
			
		||||
    use_gpu: int = -1,
 | 
			
		||||
| 
						 | 
				
			
			@ -110,7 +112,7 @@ def train(
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def train_while_improving(
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    nlp: "Language",
 | 
			
		||||
    optimizer: Optimizer,
 | 
			
		||||
    train_data,
 | 
			
		||||
    evaluate,
 | 
			
		||||
| 
						 | 
				
			
			@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def create_evaluation_callback(
 | 
			
		||||
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
 | 
			
		||||
    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
 | 
			
		||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
 | 
			
		||||
    weights = {key: value for key, value in weights.items() if value is not None}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -277,7 +279,7 @@ def create_train_batches(
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def update_meta(
 | 
			
		||||
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 | 
			
		||||
    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
 | 
			
		||||
) -> None:
 | 
			
		||||
    nlp.meta["performance"] = {}
 | 
			
		||||
    for metric in training["score_weights"]:
 | 
			
		||||
| 
						 | 
				
			
			@ -288,8 +290,10 @@ def update_meta(
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def create_before_to_disk_callback(
 | 
			
		||||
    callback: Optional[Callable[[Language], Language]]
 | 
			
		||||
) -> Callable[[Language], Language]:
 | 
			
		||||
    callback: Optional[Callable[["Language"], "Language"]]
 | 
			
		||||
) -> Callable[["Language"], "Language"]:
 | 
			
		||||
    from ..language import Language  # noqa: F811
 | 
			
		||||
 | 
			
		||||
    def before_to_disk(nlp: Language) -> Language:
 | 
			
		||||
        if not callback:
 | 
			
		||||
            return nlp
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user