mirror of https://github.com/explosion/spaCy.git
	Refactor CLI
This commit is contained in:
		
parent a89e0ff7cb
commit 822ea4ef61
spacy/cli/__init__.py
@@ -15,8 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-# from .init_model import init_model  # noqa: F401
-from .init_pipeline import init_pipeline  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
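
Note on the hunk above: the CLI package root swaps its init exports. The commented-out init_model import and the init_pipeline function are gone; what gets re-exported instead is the Typer command init_pipeline_cli. Judging by the later hunks in this commit, code that imported init_pipeline from spacy.cli is expected to use spacy.training.initialize.init_nlp instead.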
spacy/cli/_util.py
@@ -10,13 +10,12 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ensure_path

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-        e.g. {"source": "en_core_web_sm"}. If the config contains a key
-        "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.

@@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
             p = int(p)
         result.append(p)
     return result
+
+
+class CliLogger:
+    """Helper mocking up the most commonly used logger methods. Can be passed
+    into functions like train() to make them output pretty-printed messages
+    on the CLI and regular logging if used from within Python.
+    """
+
+    debug = msg.text
+    info = msg.info
+    warn = msg.info
+    error = msg.fail
+
+
+def setup_gpu(use_gpu: int):
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
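
The two helpers added to _util.py deduplicate patterns repeated across the commands below: setup_gpu replaces the per-command GPU/CPU branches, and CliLogger lets shared training code emit wasabi-styled output when driven from the CLI while accepting a standard logging.Logger when called from Python. A minimal sketch of the consumer side (report_status is a hypothetical function, not part of the commit; wasabi is already a spaCy dependency):

```python
import logging

from wasabi import msg


class CliLogger:
    # Logger-shaped shims over wasabi's printer, as in the hunk above.
    debug = msg.text
    info = msg.info
    warn = msg.info
    error = msg.fail


def report_status(logger=CliLogger) -> None:
    # Hypothetical consumer: it only relies on the logger-style interface,
    # so a standard logging.Logger can be swapped in from library code.
    logger.info("loading corpus")
    logger.error("no parsed dev examples found")


report_status()                             # wasabi-styled CLI output
report_status(logging.getLogger("spacy"))   # plain logging from Python
```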
spacy/cli/debug_config.py
@@ -1,7 +1,7 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer

@@ -52,10 +52,8 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        dot_names = ["training.dev_corpus", "training.train_corpus"]
+        util.resolve_dot_names(nlp.config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
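
debug_config drops the hand-rolled check_section_refs in favour of util.resolve_dot_names, which performs the same dangling-reference check as a side effect of resolving the dotted paths. The underlying mechanism is a dotted-path lookup; a toy illustration in plain Python (no spaCy required; the dot_to_object below is a simplified stand-in for spacy.util.dot_to_object):

```python
from functools import reduce

config = {
    "training": {"dev_corpus": "corpora.dev"},
    "corpora": {"dev": {"path": "dev.spacy"}},
}


def dot_to_object(cfg, path):
    # Walk nested dicts along a dotted path like "training.dev_corpus".
    return reduce(lambda section, key: section[key], path.split("."), cfg)


ref = dot_to_object(config, "training.dev_corpus")   # -> "corpora.dev"
print(dot_to_object(config, ref))                    # -> {'path': 'dev.spacy'}
try:
    dot_to_object(config, "corpora.train")           # dangling reference
except KeyError:
    print("not a valid section reference: corpora.train")
```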
spacy/cli/debug_data.py
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
 from ..training import Corpus, Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry
 from .. import util


@@ -94,26 +97,13 @@ def debug_data(
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
     msg.divider("Data file validation")

     # Create the gold corpus to be able to better analyze data
@@ -145,10 +135,10 @@ def debug_data(

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]

     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,6 +345,7 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
+        # TODO: does this need to be updated?
         tag_map = nlp.vocab.morphology.tag_map
         msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
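
Two things happen in debug_data: the monolithic util.resolve_training_config gives way to resolving just the [training] block against its schema (so C["training"][...] lookups become T[...]), and the tag_map/morph_rules loading disappears, presumably because those settings are being moved out of the [training] section in this refactor. The new resolution pattern, sketched standalone (assumes a spaCy v3 nightly install; "config.cfg" is a placeholder path):

```python
from spacy import util
from spacy.schemas import ConfigSchemaTraining

config = util.load_config("config.cfg")  # placeholder path
nlp = util.load_model_from_config(config)
# Resolve and validate only the [training] block instead of the whole
# config, which is what the old resolve_training_config helper did:
T = util.registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
print(T["seed"], T["frozen_components"])
```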
spacy/cli/debug_model.py
@@ -4,12 +4,14 @@ from pathlib import Path
 from spacy.training import Example
 from spacy.util import dot_to_object
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util


@@ -37,11 +39,7 @@ def debug_model_cli(

     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -65,8 +63,8 @@ def debug_model_cli(
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-        C = util.resolve_training_config(nlp.config)
-    seed = C["training"]["seed"]
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
@@ -77,7 +75,7 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(T, nlp, model, print_settings=print_settings)


 def debug_model(
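
Same pattern as in debug_data above: the hand-rolled GPU branch in debug_model_cli collapses into setup_gpu, and only the [training] section is resolved, so the resolved dict T (rather than a fully resolved config C) is what gets passed down into debug_model().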
spacy/cli/evaluate.py
@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -61,8 +61,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
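
A small behavioural change hides in this cleanup: the old evaluate() called require_gpu() without printing anything, while setup_gpu() also emits the "Using GPU/CPU" info line, so evaluate now reports its device like the other commands.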
spacy/cli/init_pipeline.py
@@ -1,22 +1,13 @@
-from typing import Optional, Dict, Callable, Any
+from typing import Optional
 import logging
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed, set_gpu_allocator
-import srsly

 from .. import util
-from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
-from ..language import Language
-from ..lookups import Lookups
-from ..errors import Errors
+from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-
-
-DEFAULT_OOV_PROB = -20
+from ._util import import_code, CliLogger, setup_gpu


 @init_cli.command(
@@ -31,178 +22,16 @@ def init_pipeline_cli(
     output_path: Path = Arg(..., help="Output directory for the prepared data"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
+    setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-    nlp = init_pipeline(config)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_success=msg.good)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
-
-
-def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
-    raw_config = config
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error():
-        nlp = util.load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
-    config = nlp.config.interpolate()
-    # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
-    optimizer = T["optimizer"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-        msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights(nlp, P, I)
-    # TODO: this should be handled better?
-    nlp = before_to_disk(nlp)
-    return nlp
-
-
-def init_vocab(
-    nlp: Language,
-    *,
-    data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
-    vectors: Optional[str] = None,
-) -> Language:
-    if lookups:
-        nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(data)
-    if data_path is not None:
-        lex_attrs = srsly.read_jsonl(data_path)
-        for lexeme in nlp.vocab:
-            lexeme.rank = OOV_RANK
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
-        if len(nlp.vocab):
-            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-        else:
-            oov_prob = DEFAULT_OOV_PROB
-        nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
-    if vectors is not None:
-        add_vectors(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
-
-
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
-) -> None:
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    P = pretrain_config
-    V = vocab_config
-    weights_data = None
-    init_tok2vec = util.ensure_path(V["init_tok2vec"])
-    if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = "Need initialize.vectors if pretraining.objective.type is vectors"
-            msg.fail(err, exits=1)
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    if weights_data is not None:
-        tok2vec_component = P["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        if P["layer"]:
-            layer = layer.get_ref(P["layer"])
-        layer.from_bytes(weights_data)
-        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-        msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
-
-
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
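
This is the heart of the commit: init_pipeline and all its helpers (init_vocab, add_tok2vec_weights, add_vectors, verify_config, verify_textcat_config, create_before_to_disk_callback) leave the CLI layer. Judging by the new import, the logic now lives in spacy/training/initialize.py as init_nlp, and the command shrinks to a thin wrapper that parses overrides, sets up the GPU and hands CliLogger plus msg.good in as output hooks. The --gpu-id option is also new here; the old init_pipeline accepted a use_gpu argument, but the command never exposed it.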
spacy/cli/pretrain.py
@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu, CliLogger
+from ..training.pretrain import pretrain
+from ..util import load_config


 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")

     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
     config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        logger=CliLogger,
     )
-
-
-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
     msg.good("Successfully finished pretrain")


-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
-
-
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
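
The same move for pretraining: the training loop, the character and vector objectives, create_pretraining_model and ProgressTracker all migrate out of the CLI, per the new `from ..training.pretrain import pretrain` import. What remains here is argument handling, verify_cli_args and the call into the library, now with logger=CliLogger for terminal output.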
					@ -1,24 +1,16 @@
 | 
				
			||||||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 | 
					from typing import Optional
 | 
				
			||||||
from timeit import default_timer as timer
 | 
					 | 
				
			||||||
import tqdm
 | 
					 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
import thinc
 | 
					from thinc.api import Config
 | 
				
			||||||
import thinc.schedules
 | 
					 | 
				
			||||||
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
 | 
					 | 
				
			||||||
import random
 | 
					 | 
				
			||||||
import typer
 | 
					import typer
 | 
				
			||||||
import logging
 | 
					import logging
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .init_pipeline import init_pipeline
 | 
					 | 
				
			||||||
from .init_pipeline import create_before_to_disk_callback
 | 
					 | 
				
			||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
					from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
				
			||||||
from ._util import import_code
 | 
					from ._util import import_code, CliLogger, setup_gpu
 | 
				
			||||||
from ..language import Language
 | 
					from ..language import Language
 | 
				
			||||||
 | 
					from ..training.loop import train
 | 
				
			||||||
 | 
					from ..training.initialize import init_nlp, must_reinitialize
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..errors import Errors
 | 
					 | 
				
			||||||
from ..util import resolve_dot_names, registry
 | 
					 | 
				
			||||||
from ..schemas import ConfigSchemaTraining
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@app.command(
 | 
					@app.command(
 | 
				
			||||||
@@ -52,31 +44,33 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_nlp(config, output_path)
+    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu)
+    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
+    if final_path:
+        msg.good(f"Saved pipeline to output directory", final_path)
 
 
-def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
+def init_pipeline(
+    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
+) -> Language:
+    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
             msg.info(f"Initializing the pipeline in {init_path}")
-            nlp = init_pipeline(config)
+            nlp = init_nlp(config, **init_kwargs)
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
             nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
-                nlp = init_pipeline(config)
+                nlp = init_nlp(config, **init_kwargs)
                 nlp.to_disk(init_path)
                 msg.good(f"Re-initialized pipeline in {init_path}")
             else:
@@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
         "the vocabulary, vectors and label scheme. To take advantage of this, "
         "provide an output directory."
     )
-    return init_pipeline(config)
+    return init_nlp(config, **init_kwargs)
-
-
-def train(
-    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
-) -> None:
-    # Create iterator, which yields out info after each optimization step.
-    config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
-    score_weights = T["score_weights"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Create iterator, which yields out info after each optimization step.
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T["dropout"],
-        accumulate_gradient=T["accumulate_gradient"],
-        patience=T["patience"],
-        max_steps=T["max_steps"],
-        eval_frequency=T["eval_frequency"],
-        exclude=frozen_components,
-    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-        if output_path is not None:
-            final_model_path = output_path / "model-final"
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory {final_model_path}")
-
-
-def must_reinitialize(train_config: Config, init_config: Config) -> bool:
-    # TODO: do this better and more fine-grained
-    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    weights = {key: value for key, value in weights.items() if value is not None}
-
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score.
-        # We can only consider scores that are ints/floats, not dicts like
-        # entity scores per type etc.
-        for key, value in scores.items():
-            if key in weights and not isinstance(value, (int, float)):
-                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-        nlp: The spaCy pipeline to evaluate.
-        optimizer: The optimizer callable.
-        train_data (Iterable[Batch]): A generator of batches, with the training
-            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-            data iterable needs to take care of iterating over the epochs and
-            shuffling.
-        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-            The callback should take no arguments and return a tuple
-            `(main_score, other_scores)`. The main_score should be a float where
-            higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-        epoch (int): How many passes over the data have been completed.
-        step (int): How many steps have been completed.
-        score (float): The main score from the last evaluation.
-        other_scores: The other scores from the last evaluation.
-        losses: The accumulated losses throughout training.
-        checkpoints: A list of previous results, where each result is a
-            (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
-        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        if metric is not None:
-            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 
 
 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
@@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-# TODO: this is currently imported by the ray extension and not used otherwise
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return None, {}, {}, weights_data
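
With this change, spacy/cli/train.py is reduced to argument handling: setup_gpu()
picks the device, init_pipeline() builds or reuses the cached "model-initial"
artifact, and the actual loop lives in spacy.training. The same flow can now be
driven without the CLI; a minimal sketch, assuming a valid config file (the
paths here are hypothetical):

    # Sketch only: programmatic equivalent of the refactored train_cli.
    from pathlib import Path
    from spacy import util
    from spacy.training.initialize import init_nlp
    from spacy.training.loop import train

    config = util.load_config(Path("config.cfg"), interpolate=False)  # hypothetical path
    nlp = init_nlp(config)                    # seed, vocab, vectors, component init
    final_path = train(nlp, Path("output"))   # path of the exported model, or None
    print(final_path)
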
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config
+
 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example
 
 
 TRAIN_DATA = [
@@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from spacy.cli.debug_config import check_section_refs
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os

@@ -414,15 +413,3 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
-
-
-def test_check_section_refs():
-    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
-    config = Config(config)
-    # Valid section reference
-    check_section_refs(config, ["a.b.c"])
-    # Section that doesn't exist in this config
-    check_section_refs(config, ["x.y.z"])
-    # Invalid section reference
-    with pytest.raises(ConfigValidationError):
-        check_section_refs(config, ["a.b.c", "f.g"])
@@ -7,7 +7,6 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from thinc.api import Optimizer
 
 
 @pytest.fixture

@@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
-
-
-def test_resolve_training_config():
-    config = {
-        "nlp": {"lang": "en", "disabled": []},
-        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
-        "corpora": {},
-    }
-    resolved = util.resolve_training_config(config)
-    assert resolved["training"]["dropout"] == 0.1
-    assert isinstance(resolved["training"]["optimizer"], Optimizer)
-    assert resolved["corpora"] == {}
-    assert "nlp" not in resolved
@@ -1,14 +1,15 @@
 import pytest
 
-from .util import get_random_doc
-
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
-from ..lang.en import English
-from ..lang.nl import Dutch
-from ..language import DEFAULT_CONFIG_PATH
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc
+
 
 @pytest.mark.parametrize(

@@ -101,8 +102,8 @@ def test_util_dot_section():
         dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    resolved = util.resolve_training_config(nl_nlp.config)
-    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
 
 
 def test_simple_frozen_list():

@@ -120,3 +121,17 @@ def test_simple_frozen_list():
     t = SimpleFrozenList(["foo", "bar"], error="Error!")
     with pytest.raises(NotImplementedError):
         t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["foo.bar"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
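
The new test_resolve_dot_names test pins down the indirection the other test
changes rely on: a dot name like "foo.bar" is looked up in the config, the
string found there ("training.optimizer") is treated as a dotted path to
another config block, and that block is resolved via the registry. Roughly, as
a hypothetical paraphrase (this is not the real implementation in spacy.util):

    # Hypothetical paraphrase of what resolve_dot_names does per name.
    def resolve_dot_name_sketch(config, dot_name):
        section_ref = util.dot_to_object(config, dot_name)  # e.g. "training.optimizer"
        block = util.dot_to_object(config, section_ref)     # e.g. {"@optimizers": "Adam.v1"}
        return util.registry.resolve({"x": block})["x"]     # e.g. an Optimizer instance

The error case in the test follows from this: "foo.baz" points at
"training.xyz", which doesn't exist, so validation fails with
["training", "xyz"] as the reported location.
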
@@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
-from spacy.util import load_model_from_config, registry, dot_to_object
-from spacy.util import resolve_training_config
+from spacy.util import load_model_from_config, registry, resolve_dot_names
+from spacy.schemas import ConfigSchemaTraining
 from spacy.training import Example
 
 

@@ -39,21 +39,21 @@ def test_readers():
 
     config = Config().from_str(config_string)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     assert isinstance(train_corpus, Callable)
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     scores = nlp.evaluate(list(dev_corpus(nlp)))
     assert scores["cats_score"]
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
-    extra_corpus = resolved["corpora"]["extra"]
+    extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
     assert isinstance(extra_corpus, Callable)

@@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
-    optimizer = resolved["training"]["optimizer"]
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):

@@ -100,7 +101,6 @@ def test_cat_readers(reader, additional_config):
         assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
         nlp.update([example], sgd=optimizer)
     # simulate performance benchmark on dev corpus
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     dev_examples = list(dev_corpus(nlp))
     for example in dev_examples:
         # this shouldn't fail if each dev example has at least one positive label
@@ -0,0 +1,205 @@
+from typing import Union, Dict, Optional, Any, List, Callable
+from thinc.api import Config, fix_random_seed, set_gpu_allocator
+from thinc.api import ConfigValidationError
+from pathlib import Path
+import srsly
+
+from .loop import create_before_to_disk_callback
+from ..language import Language
+from ..lookups import Lookups
+from ..errors import Errors
+from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..util import registry, load_model_from_config, resolve_dot_names
+from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
+
+
+def init_nlp(
+    config: Config,
+    *,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+    on_success: Callable[[str], None] = lambda x: None,
+) -> Language:
+    raw_config = config
+    config = raw_config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    # Use original config here before it's resolved to functions
+    sourced_components = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    on_success("Set up nlp object from config")
+    config = nlp.config.interpolate()
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
+    optimizer = T["optimizer"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced_components if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        on_success(f"Initialized pipeline components")
+    # Verify the config after calling 'begin_training' to ensure labels
+    # are properly initialized
+    verify_config(nlp)
+    if "pretraining" in config and config["pretraining"]:
+        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
+        loaded = add_tok2vec_weights(nlp, P, I)
+        if loaded and P["component"]:
+            on_success(f"Loaded pretrained weights into component '{P['component']}'")
+    nlp = before_to_disk(nlp)
+    return nlp
+
+
+def must_reinitialize(train_config: Config, init_config: Config) -> bool:
+    # TODO: do this better and more fine-grained
+    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
+
+
+def init_vocab(
+    nlp: Language,
+    *,
+    data: Optional[Path] = None,
+    lookups: Optional[Lookups] = None,
+    vectors: Optional[str] = None,
+    on_success: Callable[[str], None] = lambda x: None,
+) -> Language:
+    if lookups:
+        nlp.vocab.lookups = lookups
+        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
+    data_path = ensure_path(data)
+    if data_path is not None:
+        lex_attrs = srsly.read_jsonl(data_path)
+        for lexeme in nlp.vocab:
+            lexeme.rank = OOV_RANK
+        for attrs in lex_attrs:
+            if "settings" in attrs:
+                continue
+            lexeme = nlp.vocab[attrs["orth"]]
+            lexeme.set_attrs(**attrs)
+        if len(nlp.vocab):
+            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+        else:
+            oov_prob = DEFAULT_OOV_PROB
+        nlp.vocab.cfg.update({"oov_prob": oov_prob})
+        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    on_success("Created vocabulary")
+    if vectors is not None:
+        load_vectors_into_model(nlp, vectors)
+        on_success(f"Added vectors: {vectors}")
+
+
+def load_vectors_into_model(
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
+) -> None:
+    """Load word vectors from an installed model or path into a model instance."""
+    try:
+        vectors_nlp = load_model(name)
+    except ConfigValidationError as e:
+        title = f"Config validation error for vectors {name}"
+        desc = (
+            "This typically means that there's a problem in the config.cfg included "
+            "with the packaged vectors. Make sure that the vectors package you're "
+            "loading is compatible with the current version of spaCy."
+        )
+        err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
+        raise err from None
+    nlp.vocab.vectors = vectors_nlp.vocab.vectors
+    if add_strings:
+        # I guess we should add the strings from the vectors_nlp model?
+        # E.g. if someone does a similarity query, they might expect the strings.
+        for key in nlp.vocab.vectors.key2row:
+            if key in vectors_nlp.vocab.strings:
+                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
+
+
+def add_tok2vec_weights(
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+) -> bool:
+    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
+    P = pretrain_config
+    V = vocab_config
+    weights_data = None
+    init_tok2vec = ensure_path(V["init_tok2vec"])
+    if init_tok2vec is not None:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
+            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
+            raise ConfigValidationError(config=nlp.config, errors=errors)
+        if not init_tok2vec.exists():
+            err = f"can't find pretrained tok2vec: {init_tok2vec}"
+            errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
+            raise ConfigValidationError(config=nlp.config, errors=errors)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    if weights_data is not None:
+        tok2vec_component = P["component"]
+        if tok2vec_component is None:
+            desc = (
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them."
+            )
+            err = "component can't be null"
+            errors = [{"loc": ["pretraining", "component"], "msg": err}]
+            raise ConfigValidationError(
+                config=nlp.config["pretraining"], errors=errors, desc=desc
+            )
+        layer = nlp.get_pipe(tok2vec_component).model
+        if P["layer"]:
+            layer = layer.get_ref(P["layer"])
+        layer.from_bytes(weights_data)
+        return True
+    return False
+
+
+def verify_config(nlp: Language) -> None:
+    """Perform additional checks based on the config, loaded nlp object and training data."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for pipe_config in nlp.config["components"].values():
+        # We can't assume that the component name == the factory
+        factory = pipe_config["factory"]
+        if factory == "textcat":
+            verify_textcat_config(nlp, pipe_config)
+
+
+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+    # if 'positive_label' is provided: double check whether it's in the data and
+    # the task is binary
+    if pipe_config.get("positive_label"):
+        textcat_labels = nlp.get_pipe("textcat").labels
+        pos_label = pipe_config.get("positive_label")
+        if pos_label not in textcat_labels:
+            raise ValueError(
+                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
+            )
+        if len(list(textcat_labels)) != 2:
+            raise ValueError(
+                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
+            )
+
+
+def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
+    """RETURNS (List[str]): All sourced components in the original config,
+        e.g. {"source": "en_core_web_sm"}. If the config contains a key
+        "factory", we assume it refers to a component factory.
+    """
+    return [
+        name
+        for name, cfg in config.get("components", {}).items()
+        if "factory" not in cfg and "source" in cfg
+    ]
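
spacy/training/initialize.py now owns everything that happens before the first
optimization step: seeding, vocab and vector setup, begin_training for regular
components, resume_training for sourced ones, and loading pretrained tok2vec
weights. One line worth a second look: must_reinitialize() returns True when
the two interpolated configs are *equal*, which reads inverted relative to its
call site in train_cli (the warning there says "Config has changed"); if the
comparison is intentional, a comment would help. The sourced-component split is
easy to check directly; a small illustration of get_sourced_components() (the
config dict here is made up):

    # Illustration: components with "source" and no "factory" count as sourced.
    config = {
        "components": {
            "ner": {"source": "en_core_web_sm"},   # sourced -> resume_training
            "tagger": {"factory": "tagger"},       # factory -> begin_training
        }
    }
    assert get_sourced_components(config) == ["ner"]
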

spacy/training/loop.py (new file, 301 lines)
@@ -0,0 +1,301 @@
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import Optional
+from pathlib import Path
+from timeit import default_timer as timer
+from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+import random
+import tqdm
+
+from .example import Example
+from ..schemas import ConfigSchemaTraining
+from ..language import Language
+from ..errors import Errors
+from ..util import resolve_dot_names, registry, logger
+
+
+def train(
+    nlp: Language,
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+) -> Optional[Path]:
+    """Train a pipeline.
+
+    nlp (Language): The initialized nlp object with the full config.
+    output_path (Path): Optional output path to save trained model to.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    logger (Callable[[Any], Any]): Optional logger exposing the methods info,
+        error, debug and warn. Defaults to regular spaCy logger but can be
+        swapped for CLI logger.
+    RETURNS (Path / None): The path to the final exported model.
+    """
+
+    # Create iterator, which yields out info after each optimization step.
+    config = nlp.config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = T["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = train_while_improving(
+        nlp,
+        optimizer,
+        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
+        create_evaluation_callback(nlp, dev_corpus, score_weights),
+        dropout=T["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        patience=T["patience"],
+        max_steps=T["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+    )
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if frozen_components:
+        logger.info(f"Frozen components: {frozen_components}")
+    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
 | 
				
			||||||
 | 
					    with nlp.select_pipes(disable=frozen_components):
 | 
				
			||||||
 | 
					        print_row, finalize_logger = train_logger(nlp)
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
 | 
				
			||||||
 | 
					        progress.set_description(f"Epoch 1")
 | 
				
			||||||
 | 
					        for batch, info, is_best_checkpoint in training_step_iterator:
 | 
				
			||||||
 | 
					            progress.update(1)
 | 
				
			||||||
 | 
					            if is_best_checkpoint is not None:
 | 
				
			||||||
 | 
					                progress.close()
 | 
				
			||||||
 | 
					                print_row(info)
 | 
				
			||||||
 | 
					                if is_best_checkpoint and output_path is not None:
 | 
				
			||||||
 | 
					                    with nlp.select_pipes(disable=frozen_components):
 | 
				
			||||||
 | 
					                        update_meta(T, nlp, info)
 | 
				
			||||||
 | 
					                    with nlp.use_params(optimizer.averages):
 | 
				
			||||||
 | 
					                        nlp = before_to_disk(nlp)
 | 
				
			||||||
 | 
					                        nlp.to_disk(output_path / "model-best")
 | 
				
			||||||
 | 
					                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
 | 
				
			||||||
 | 
					                progress.set_description(f"Epoch {info['epoch']}")
 | 
				
			||||||
 | 
					    except Exception as e:
 | 
				
			||||||
 | 
					        finalize_logger()
 | 
				
			||||||
 | 
					        if output_path is not None:
 | 
				
			||||||
 | 
					            # We don't want to swallow the traceback if we don't have a
 | 
				
			||||||
 | 
					            # specific error.
 | 
				
			||||||
 | 
					            logger.warn(
 | 
				
			||||||
 | 
					                f"Aborting and saving the final best model. "
 | 
				
			||||||
 | 
					                f"Encountered exception: {str(e)}"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            nlp = before_to_disk(nlp)
 | 
				
			||||||
 | 
					            nlp.to_disk(output_path / "model-final")
 | 
				
			||||||
 | 
					        raise e
 | 
				
			||||||
 | 
					    finally:
 | 
				
			||||||
 | 
					        finalize_logger()
 | 
				
			||||||
 | 
					        if output_path is not None:
 | 
				
			||||||
 | 
					            final_model_path = output_path / "model-final"
 | 
				
			||||||
 | 
					            if optimizer.averages:
 | 
				
			||||||
 | 
					                with nlp.use_params(optimizer.averages):
 | 
				
			||||||
 | 
					                    nlp.to_disk(final_model_path)
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                nlp.to_disk(final_model_path)
 | 
				
			||||||
 | 
					            return final_model_path
 | 
				
			||||||
 | 
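For orientation, a minimal sketch of driving this entry point outside the CLI. The config path and output directory are placeholders, and the pipeline is assumed to already be initialized before training starts:

from pathlib import Path
from thinc.api import Config
from spacy.util import load_model_from_config
from spacy.training.loop import train  # this module

config = Config().from_disk("config.cfg")  # hypothetical path
nlp = load_model_from_config(config)       # assumes weights are initialized elsewhere
final_path = train(nlp, output_path=Path("training"), use_gpu=-1)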
					

def train_while_improving(
    nlp: Language,
    optimizer: Optimizer,
    train_data,
    evaluate,
    *,
    dropout: float,
    eval_frequency: int,
    accumulate_gradient: int,
    patience: int,
    max_steps: int,
    exclude: List[str],
):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
    where info is a dict, and is_best_checkpoint is in [True, False, None] --
    None indicating that the iteration was not evaluated as a checkpoint.
    The evaluation is conducted by calling the evaluate callback.

    Positional arguments:
        nlp: The spaCy pipeline to evaluate.
        optimizer: The optimizer callable.
        train_data (Iterable[Batch]): A generator of batches, with the training
            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
            data iterable needs to take care of iterating over the epochs and
            shuffling.
        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
            The callback should take no arguments and return a tuple
            `(main_score, other_scores)`. The main_score should be a float where
            higher is better. other_scores can be any object.

    Every iteration, the function yields out a tuple with:

    * batch: A list of Example objects.
    * info: A dict with various information about the last update (see below).
    * is_best_checkpoint: A value in None, False, True, indicating whether this
        was the best evaluation so far. You should use this to save the model
        checkpoints during training. If None, evaluation was not conducted on
        that iteration. False means evaluation was conducted, but a previous
        evaluation was better.

    The info dict provides the following information:

        epoch (int): How many passes over the data have been completed.
        step (int): How many steps have been completed.
        score (float): The main score from the last evaluation.
        other_scores: The other scores from the last evaluation.
        losses: The accumulated losses throughout training.
        checkpoints: A list of previous results, where each result is a
            (score, step) tuple.
    """
    if isinstance(dropout, float):
        dropouts = constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    words_seen = 0
    start_time = timer()
    for step, (epoch, batch) in enumerate(train_data):
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
            )
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
            if (
                name not in exclude
                and hasattr(proc, "model")
                and proc.model not in (True, False, None)
            ):
                proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    score, other_scores = evaluate()
            else:
                score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        words_seen += sum(len(eg) for eg in batch)
        info = {
            "epoch": epoch,
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
            "seconds": int(timer() - start_time),
            "words": words_seen,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        best_score, best_step = max(results)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and step >= max_steps:
            break
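A sketch of consuming the generator contract described above; training_step_iterator and save_checkpoint are placeholders:

for batch, info, is_best_checkpoint in training_step_iterator:
    if is_best_checkpoint:              # evaluated this step and improved
        save_checkpoint(info["step"])   # hypothetical helper
    elif is_best_checkpoint is False:   # evaluated, but a previous result was better
        pass
    # None means this step wasn't an evaluation step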

def subdivide_batch(batch, accumulate_gradient):
    batch = list(batch)
    batch.sort(key=lambda eg: len(eg.predicted))
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for i in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    subbatch = batch[start:]
    if subbatch:
        yield subbatch
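A quick illustration of the splitting behaviour, using a stand-in for Example since only len(eg.predicted) is consulted:

from dataclasses import dataclass

@dataclass
class FakeExample:  # stand-in for Example, for illustration only
    predicted: str

batch = [FakeExample("x" * n) for n in (5, 1, 3, 2, 4, 6, 7, 8)]
print([len(sub) for sub in subdivide_batch(batch, accumulate_gradient=3)])
# [2, 2, 2, 2] -- three slices of len(batch) // 3 == 2, plus the remainder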

def create_evaluation_callback(
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
    weights = {key: value for key, value in weights.items() if value is not None}

    def evaluate() -> Tuple[float, Dict[str, float]]:
        dev_examples = list(dev_corpus(nlp))
        scores = nlp.evaluate(dev_examples)
        # Calculate a weighted sum based on score_weights for the main score.
        # We can only consider scores that are ints/floats, not dicts like
        # entity scores per type etc.
        for key, value in scores.items():
            if key in weights and not isinstance(value, (int, float)):
                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
        try:
            weighted_score = sum(
                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
            )
        except KeyError as e:
            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err) from None
        return weighted_score, scores

    return evaluate
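The main score is a plain weighted sum over the flat scores dict, for example:

scores = {"tag_acc": 0.92, "dep_uas": 0.88, "ents_per_type": {}}  # dict value ignored: not in weights
weights = {"tag_acc": 0.5, "dep_uas": 0.5}
print(sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights))  # 0.9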

def create_train_batches(
    iterator: Iterator[Example],
    batcher: Callable[[Iterable[Example]], Iterable[Example]],
    max_epochs: int,
):
    epoch = 0
    examples = list(iterator)
    if not examples:
        # Raise error if no data
        raise ValueError(Errors.E986)
    while max_epochs < 1 or epoch != max_epochs:
        random.shuffle(examples)
        for batch in batcher(examples):
            yield epoch, batch
        epoch += 1
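A tiny illustration of the epoch counter, with ints standing in for Example objects and a trivial batcher (shuffling makes the batch contents vary per epoch):

import random

def pair_batcher(items):  # hypothetical batcher: fixed pairs
    items = list(items)
    for i in range(0, len(items), 2):
        yield items[i : i + 2]

for epoch, batch in create_train_batches(iter([1, 2, 3, 4]), pair_batcher, max_epochs=2):
    print(epoch, batch)  # epoch 0 twice, then epoch 1 twice
# Note: max_epochs < 1 streams epochs forever; stopping is left to the caller.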

def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
    nlp.meta["performance"] = {}
    for metric in training["score_weights"]:
        if metric is not None:
            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]


def create_before_to_disk_callback(
    callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
    def before_to_disk(nlp: Language) -> Language:
        if not callback:
            return nlp
        modified_nlp = callback(nlp)
        if not isinstance(modified_nlp, Language):
            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
            raise ValueError(err)
        return modified_nlp

    return before_to_disk
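A sketch of a before_to_disk callback: it must return the (possibly modified) Language object, otherwise E914 is raised. The cleanup shown is hypothetical:

def strip_debug_meta(nlp):
    nlp.meta.pop("debug", None)  # hypothetical cleanup before serialization
    return nlp

before_to_disk = create_before_to_disk_callback(strip_debug_meta)
nlp = before_to_disk(nlp)  # nlp assumed to be a loaded Language; validated on return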
							
								
								
									
267  spacy/training/pretrain.py  Normal file
@@ -0,0 +1,267 @@
from typing import Optional, Callable, Any, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
from pathlib import Path
from functools import partial
from collections import Counter
import srsly
import numpy
import time
import re
from wasabi import msg

from .example import Example
from ..tokens import Doc
from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, dot_to_object, logger


def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
):
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    nlp = load_model_from_config(config)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
    corpus = dot_to_object(T, P["corpus"])
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
    # Load in pretrained weights to resume from
    if resume_path is not None:
        _resume_model(model, resume_path, epoch_resume)
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0

    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    objective = create_objective(P["objective"])
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0

def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
    docs = []
    for eg_or_doc in examples_or_docs:
        if isinstance(eg_or_doc, Doc):
            docs.append(eg_or_doc)
        else:
            docs.append(eg_or_doc.reference)
    return docs

def _resume_model(
    model: Model,
    resume_path: Path,
    epoch_resume: int,
    logger: Callable[[Any], Any] = logger,
) -> None:
    logger.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
        model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name:
        # Default weight file name, so read the starting epoch from it by
        # cutting off 'model' and '.bin'
        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
    logger.info(f"Resuming from epoch: {epoch_resume}")
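The filename convention assumed by the epoch parsing above is "model<epoch>.bin":

import re

m = re.search(r"model\d+\.bin", "training/model7.bin")
print(int(m.group(0)[5:][:-4]) + 1)  # 8 -> resume with the next epoch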

def make_update(
    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
) -> float:
    """Perform an update over a single batch of documents.

    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    RETURNS loss: A float for the loss.
    """
    predictions, backprop = model.begin_update(docs)
    loss, gradients = objective_func(model.ops, docs, predictions)
    backprop(gradients)
    model.finish_update(optimizer)
    # We don't want to return a cupy object here. The gradients are modified
    # in-place by the BERT MLM, so we get an accurate loss.
    return float(loss)

def create_objective(config: Config):
    """Create the objective for pretraining.

    We'd like to replace this with a registry function but it's tricky because
    we're also making a model choice based on this. For now we hard-code support
    for two types (characters, vectors). For characters you can specify
    n_characters, for vectors you can specify the loss.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    elif objective_type == "vectors":
        if config["loss"] == "cosine":
            distance = CosineDistance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        elif config["loss"] == "L2":
            distance = L2Distance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        else:
            raise ValueError("Unexpected loss type", config["loss"])
    else:
        raise ValueError("Unexpected objective_type", objective_type)
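Examples of the two objective blocks this function accepts, with the shapes taken from the code above:

char_cfg = {"type": "characters", "n_characters": 4}
vec_cfg = {"type": "vectors", "loss": "cosine"}  # or "L2"
loss_fn = create_objective(vec_cfg)  # -> partial(get_vectors_loss, distance=...)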

def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.
    """
    # The simplest way to implement this would be to vstack the
    # token.vector values, but that's a bit inefficient, especially on GPU.
    # Instead we fetch the index into the vectors table for each of our tokens,
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    target = docs[0].vocab.vectors.data[ids]
    d_target, loss = distance(prediction, target)
    return loss, d_target

def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a loss based on a number of characters predicted from the docs."""
    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
    target_ids = target_ids.reshape((-1,))
    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    d_target = diff / float(prediction.shape[0])
    return loss, d_target
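A shape sanity check on the squared-error maths above, with random arrays in place of real predictions (nr_char and the token count are arbitrary):

import numpy

nr_char, n_tokens = 4, 3
prediction = numpy.random.rand(n_tokens, 256 * nr_char).astype("f")
target = numpy.random.rand(n_tokens, 256 * nr_char).astype("f")
diff = prediction - target
print((diff ** 2).sum())                          # scalar squared-error loss
print((diff / float(prediction.shape[0])).shape)  # (3, 1024), same shape as the prediction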

def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
    component = nlp.get_pipe(pretrain_config["component"])
    if pretrain_config.get("layer"):
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    else:
        tok2vec = component.model

    # TODO
    maxout_pieces = 3
    hidden_size = 300
    if pretrain_config["objective"]["type"] == "vectors":
        model = build_cloze_multi_task_model(
            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
    elif pretrain_config["objective"]["type"] == "characters":
        model = build_cloze_characters_multi_task_model(
            nlp.vocab,
            tok2vec,
            hidden_size=hidden_size,
            maxout_pieces=maxout_pieces,
            nr_char=pretrain_config["objective"]["n_characters"],
        )
    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
    set_dropout_rate(model, pretrain_config["dropout"])
    return model

class ProgressTracker:
    def __init__(self, frequency=1000000):
        self.loss = 0.0
        self.prev_loss = 0.0
        self.nr_word = 0
        self.words_per_epoch = Counter()
        self.frequency = frequency
        self.last_time = time.time()
        self.last_update = 0
        self.epoch_loss = 0.0

    def update(self, epoch, loss, docs):
        self.loss += loss
        self.epoch_loss += loss
        words_in_batch = sum(len(doc) for doc in docs)
        self.words_per_epoch[epoch] += words_in_batch
        self.nr_word += words_in_batch
        words_since_update = self.nr_word - self.last_update
        if words_since_update >= self.frequency:
            wps = words_since_update / (time.time() - self.last_time)
            self.last_update = self.nr_word
            self.last_time = time.time()
            loss_per_word = self.loss - self.prev_loss
            status = (
                epoch,
                self.nr_word,
                _smart_round(self.loss, width=10),
                _smart_round(loss_per_word, width=6),
                int(wps),
            )
            self.prev_loss = float(self.loss)
            return status
        else:
            return None

def _smart_round(
    figure: Union[float, int], width: int = 10, max_decimal: int = 4
) -> str:
    """Round large numbers as integers, smaller numbers as decimals."""
    n_digits = len(str(int(figure)))
    n_decimal = width - (n_digits + 1)
    if n_decimal <= 1:
        return str(int(figure))
    else:
        n_decimal = min(n_decimal, max_decimal)
        format_str = "%." + str(n_decimal) + "f"
        return format_str % figure
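Two examples of the rounding behaviour, using the defaults above:

print(_smart_round(123456.789))  # '123456.789' -- 3 decimals still fit width 10
print(_smart_round(0.123456))    # '0.1235' -- capped at max_decimal=4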

spacy/util.py
@@ -8,6 +8,7 @@ import re
 from pathlib import Path
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError
 import functools
 import itertools
 import numpy.random
@@ -56,6 +57,7 @@ if TYPE_CHECKING:
 
 
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+DEFAULT_OOV_PROB = -20
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config.cfg. Not all sections needs to exist,
@@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path:
     return Path(sys.modules[module.__module__].__file__).parent
 
 
-def load_vectors_into_model(
-    nlp: "Language", name: Union[str, Path], *, add_strings=True
-) -> None:
-    """Load word vectors from an installed model or path into a model instance."""
-    vectors_nlp = load_model(name)
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
-
-
 def load_model(
     name: Union[str, Path],
     *,
@@ -391,32 +379,9 @@ def load_model_from_config(
     return nlp
 
 
-def resolve_training_config(
-    config: Config,
-    exclude: Iterable[str] = ("nlp", "components"),
-    validate: bool = True,
-) -> Dict[str, Any]:
-    """Resolve the config sections relevant for training and create all objects.
-    Mostly used in the CLI to separate training config (not resolved by default
-    because not runtime-relevant – an nlp object should load fine even if its
-    [training] block refers to functions that are not available etc.).
-
-    config (Config): The config to resolve.
-    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
-        be available in the final resolved config.
-    validate (bool): Whether to validate the config.
-    RETURNS (Dict[str, Any]): The resolved config.
-    """
-    config = config.copy()
-    for key in exclude:
-        if key in config:
-            config.pop(key)
-    return registry.resolve(config, validate=validate)
-
-
 def resolve_dot_names(
     config: Config, dot_names: List[Optional[str]]
-) -> List[Optional[Callable]]:
+) -> Tuple[Any]:
     """Resolve one or more "dot notation" names, e.g. corpora.train.
     The paths could point anywhere into the config, so we don't know which
     top-level section we'll be looking within.
@@ -424,18 +389,42 @@ def resolve_dot_names(
     We resolve the whole top-level section, although we could resolve less --
     we could find the lowest part of the tree.
     """
+    # TODO: include schema?
+    # TODO: clean this up and avoid duplication
     resolved = {}
     output = []
+    errors = []
     for name in dot_names:
         if name is None:
             output.append(name)
         else:
             section = name.split(".")[0]
-            # We want to avoid resolving the same thing twice.
+            # We want to avoid resolving the same thing twice
             if section not in resolved:
                 resolved[section] = registry.resolve(config[section])
-            output.append(dot_to_object(resolved, name))
-    return output
+            try:
+                output.append(dot_to_object(resolved, name))
+            except KeyError:
+                msg = f"not a valid section reference: {name}"
+                errors.append({"loc": name.split("."), "msg": msg})
+    objects = []
+    for ref in output:
+        if not isinstance(ref, str):
+            msg = f"not a valid section reference: {ref} ({type(ref)})"
+            errors.append({"loc": ref.split("."), "msg": msg})
+            continue
+        section = ref.split(".")[0]
+        # We want to avoid resolving the same thing twice
+        if section not in resolved:
+            resolved[section] = registry.resolve(config[section])
+        try:
+            objects.append(dot_to_object(resolved, ref))
+        except KeyError:
+            msg = f"not a valid section reference: {ref}"
+            errors.append({"loc": ref.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config=config, errors=errors)
+    return tuple(objects)
 
 
 def load_model_from_init_py(
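A sketch of the call pattern for the new resolve_dot_names, matching its use in spacy/training/loop.py above (config is a placeholder for a full interpolated Config):

dot_names = ["corpora.train", "corpora.dev"]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
# Invalid references are collected and raised together as a ConfigValidationError.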