diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 5569e630d..7368bcef3 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,8 +15,7 @@ from .debug_config import debug_config # noqa: F401 from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 -#from .init_model import init_model # noqa: F401 -from .init_pipeline import init_pipeline # noqa: F401 +from .init_pipeline import init_pipeline_cli # noqa: F401 from .init_config import init_config, fill_config # noqa: F401 from .validate import validate # noqa: F401 from .project.clone import project_clone # noqa: F401 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 7ff2c6199..c41905970 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -10,13 +10,12 @@ from click import NoSuchOption from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager -from thinc.api import Config, ConfigValidationError +from thinc.api import Config, ConfigValidationError, require_gpu from configparser import InterpolationError import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import ensure_path if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) -def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: - """RETURNS (List[str]): All sourced components in the original config, - e.g. {"source": "en_core_web_sm"}. If the config contains a key - "factory", we assume it refers to a component factory. - """ - return [ - name - for name, cfg in config.get("components", {}).items() - if "factory" not in cfg and "source" in cfg - ] - - def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: """Upload a file. @@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in p = int(p) result.append(p) return result + + +class CliLogger: + """Helper mocking up the most commonly used logger methods. Can be passed + into functions like train() to make them output pretty-printed messages + on the CLI and regular logging if used from within Python. + """ + + debug = msg.text + info = msg.info + warn = msg.info + error = msg.fail + + +def setup_gpu(use_gpu: int): + if use_gpu >= 0: + msg.info(f"Using GPU: {use_gpu}") + require_gpu(use_gpu) + else: + msg.info("Using CPU") diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 131fecf6d..d1dcc45b9 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -1,7 +1,7 @@ from typing import Optional, Dict, Any, Union, List from pathlib import Path from wasabi import msg, table -from thinc.api import Config, ConfigValidationError +from thinc.api import Config from thinc.config import VARIABLE_RE import typer @@ -52,10 +52,8 @@ def debug_config( with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) nlp = util.load_model_from_config(config) - # Use the resolved config here in case user has one function returning - # a dict of corpora etc. 
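For orientation, a minimal sketch of how the two helpers added to spacy/cli/_util.py above are meant to be used; the standalone import path assumes this branch, and the calls themselves are illustrative, not part of the patch.

from spacy.cli._util import CliLogger, setup_gpu

# Select the device once at the start of a command: a non-negative ID calls
# thinc's require_gpu(), -1 keeps everything on CPU.
setup_gpu(-1)

# CliLogger mimics the common logger methods but prints pretty wasabi messages,
# so the same function can log via a regular logger in library code and via
# msg.* when driven from the CLI.
CliLogger.info("Loading config")    # -> msg.info
CliLogger.warn("No GPU available")  # -> msg.info (warn is mapped to info here)
CliLogger.error("Invalid config")   # -> msg.fail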
- resolved = util.resolve_training_config(nlp.config) - check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"]) + dot_names = ["training.dev_corpus", "training.train_corpus"] + util.resolve_dot_names(nlp.config, dot_names) msg.good("Config is valid") if show_vars: variables = get_variables(config) @@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]: value = util.dot_to_object(config, path) result[variable] = repr(value) return result - - -def check_section_refs(config: Config, fields: List[str]) -> None: - """Validate fields in the config that refer to other sections or values - (e.g. in the corpora) and make sure that those references exist. - """ - errors = [] - for field in fields: - # If the field doesn't exist in the config, we ignore it - try: - value = util.dot_to_object(config, field) - except KeyError: - continue - try: - util.dot_to_object(config, value) - except KeyError: - msg = f"not a valid section reference: {value}" - errors.append({"loc": field.split("."), "msg": msg}) - if errors: - raise ConfigValidationError(config=config, errors=errors) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 302bfd563..f0e76be2b 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg import typer from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides -from ._util import import_code, debug_cli, get_sourced_components +from ._util import import_code, debug_cli from ..training import Corpus, Example +from ..training.initialize import get_sourced_components +from ..schemas import ConfigSchemaTraining from ..pipeline._parser_internals import nonproj from ..language import Language +from ..util import registry from .. 
import util @@ -94,26 +97,13 @@ def debug_data( with show_validation_error(config_path): cfg = util.load_config(config_path, overrides=config_overrides) nlp = util.load_model_from_config(cfg) - C = util.resolve_training_config(nlp.config) + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) # Use original config here, not resolved version sourced_components = get_sourced_components(cfg) - frozen_components = C["training"]["frozen_components"] + frozen_components = T["frozen_components"] resume_components = [p for p in sourced_components if p not in frozen_components] pipeline = nlp.pipe_names factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] - tag_map_path = util.ensure_path(C["training"]["tag_map"]) - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - morph_rules_path = util.ensure_path(C["training"]["morph_rules"]) - morph_rules = {} - if morph_rules_path is not None: - morph_rules = srsly.read_json(morph_rules_path) - # Replace tag map with provided mapping - nlp.vocab.morphology.load_tag_map(tag_map) - # Load morph rules - nlp.vocab.morphology.load_morph_exceptions(morph_rules) - msg.divider("Data file validation") # Create the gold corpus to be able to better analyze data @@ -145,10 +135,10 @@ def debug_data( train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] - frozen_components = C["training"]["frozen_components"] + frozen_components = T["frozen_components"] msg.divider("Training stats") - msg.text(f"Language: {C['nlp']['lang']}") + msg.text(f"Language: {nlp.lang}") msg.text(f"Training pipeline: {', '.join(pipeline)}") if resume_components: msg.text(f"Components from other pipelines: {', '.join(resume_components)}") @@ -355,6 +345,7 @@ def debug_data( if "tagger" in factory_names: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] + # TODO: does this need to be updated? tag_map = nlp.vocab.morphology.tag_map msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)") labels_with_counts = _format_labels( diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 6f554ed2d..f8fc687fa 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -4,12 +4,14 @@ from pathlib import Path from spacy.training import Example from spacy.util import dot_to_object from wasabi import msg -from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam +from thinc.api import fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation, set_gpu_allocator import typer from ._util import Arg, Opt, debug_cli, show_validation_error -from ._util import parse_config_overrides, string_to_list +from ._util import parse_config_overrides, string_to_list, setup_gpu +from ..schemas import ConfigSchemaTraining +from ..util import registry from .. 
import util @@ -37,11 +39,7 @@ def debug_model_cli( DOCS: https://nightly.spacy.io/api/cli#debug-model """ - if use_gpu >= 0: - msg.info("Using GPU") - require_gpu(use_gpu) - else: - msg.info("Using CPU") + setup_gpu(use_gpu) layers = string_to_list(layers, intify=True) print_settings = { "dimensions": dimensions, @@ -65,8 +63,8 @@ def debug_model_cli( set_gpu_allocator(allocator) with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) - C = util.resolve_training_config(nlp.config) - seed = C["training"]["seed"] + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) @@ -77,7 +75,7 @@ def debug_model_cli( exits=1, ) model = pipe.model - debug_model(C, nlp, model, print_settings=print_settings) + debug_model(T, nlp, model, print_settings=print_settings) def debug_model( diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index f9954d9ad..4c1eeb9e8 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -3,11 +3,11 @@ from wasabi import Printer from pathlib import Path import re import srsly -from thinc.api import require_gpu, fix_random_seed +from thinc.api import fix_random_seed from ..training import Corpus from ..tokens import Doc -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, setup_gpu from ..scorer import Scorer from .. import util from .. import displacy @@ -61,8 +61,7 @@ def evaluate( ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() - if use_gpu >= 0: - require_gpu(use_gpu) + setup_gpu(use_gpu) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 0c4b6ec70..de1dc8a46 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -1,22 +1,13 @@ -from typing import Optional, Dict, Callable, Any +from typing import Optional import logging from pathlib import Path from wasabi import msg import typer -from thinc.api import Config, fix_random_seed, set_gpu_allocator -import srsly from .. 
import util -from ..util import registry, resolve_dot_names, OOV_RANK -from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit -from ..language import Language -from ..lookups import Lookups -from ..errors import Errors +from ..training.initialize import init_nlp from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, get_sourced_components - - -DEFAULT_OOV_PROB = -20 +from ._util import import_code, CliLogger, setup_gpu @init_cli.command( @@ -31,178 +22,16 @@ def init_pipeline_cli( output_path: Path = Arg(..., help="Output directory for the prepared data"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) overrides = parse_config_overrides(ctx.args) import_code(code_path) + setup_gpu(use_gpu) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) - nlp = init_pipeline(config) + with show_validation_error(hint_fill=False): + nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good) nlp.to_disk(output_path) msg.good(f"Saved initialized pipeline to {output_path}") - - -def init_pipeline(config: Config, use_gpu: int = -1) -> Language: - raw_config = config - config = raw_config.interpolate() - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - # Use original config here before it's resolved to functions - sourced_components = get_sourced_components(config) - with show_validation_error(): - nlp = util.load_model_from_config(raw_config, auto_fill=True) - msg.good("Set up nlp object from config") - config = nlp.config.interpolate() - # Resolve all training-relevant sections using the filled nlp config - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) - dot_names = [T["train_corpus"], T["dev_corpus"]] - train_corpus, dev_corpus = resolve_dot_names(config, dot_names) - I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) - V = I["vocab"] - init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"]) - optimizer = T["optimizer"] - before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) - # Components that shouldn't be updated during training - frozen_components = T["frozen_components"] - # Sourced components that require resume_training - resume_components = [p for p in sourced_components if p not in frozen_components] - msg.info(f"Pipeline: {nlp.pipe_names}") - if resume_components: - with nlp.select_pipes(enable=resume_components): - msg.info(f"Resuming training for: {resume_components}") - nlp.resume_training(sgd=optimizer) - with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) - msg.good(f"Initialized pipeline components") - # Verify the config after calling 'begin_training' to ensure labels - # are properly initialized - verify_config(nlp) - if "pretraining" in config and config["pretraining"]: - P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) - add_tok2vec_weights(nlp, 
P, I) - # TODO: this should be handled better? - nlp = before_to_disk(nlp) - return nlp - - -def init_vocab( - nlp: Language, - *, - data: Optional[Path] = None, - lookups: Optional[Lookups] = None, - vectors: Optional[str] = None, -) -> Language: - if lookups: - nlp.vocab.lookups = lookups - msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}") - data_path = util.ensure_path(data) - if data_path is not None: - lex_attrs = srsly.read_jsonl(data_path) - for lexeme in nlp.vocab: - lexeme.rank = OOV_RANK - for attrs in lex_attrs: - if "settings" in attrs: - continue - lexeme = nlp.vocab[attrs["orth"]] - lexeme.set_attrs(**attrs) - if len(nlp.vocab): - oov_prob = min(lex.prob for lex in nlp.vocab) - 1 - else: - oov_prob = DEFAULT_OOV_PROB - nlp.vocab.cfg.update({"oov_prob": oov_prob}) - msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab") - msg.good("Created vocabulary") - if vectors is not None: - add_vectors(nlp, vectors) - msg.good(f"Added vectors: {vectors}") - - -def add_tok2vec_weights( - nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] -) -> None: - # Load pretrained tok2vec weights - cf. CLI command 'pretrain' - P = pretrain_config - V = vocab_config - weights_data = None - init_tok2vec = util.ensure_path(V["init_tok2vec"]) - if init_tok2vec is not None: - if P["objective"].get("type") == "vectors" and not V["vectors"]: - err = "Need initialize.vectors if pretraining.objective.type is vectors" - msg.fail(err, exits=1) - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - if weights_data is not None: - tok2vec_component = P["component"] - if tok2vec_component is None: - msg.fail( - f"To use pretrained tok2vec weights, [pretraining.component] " - f"needs to specify the component that should load them.", - exits=1, - ) - layer = nlp.get_pipe(tok2vec_component).model - if P["layer"]: - layer = layer.get_ref(P["layer"]) - layer.from_bytes(weights_data) - msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'") - - -def add_vectors(nlp: Language, vectors: str) -> None: - title = f"Config validation error for vectors {vectors}" - desc = ( - "This typically means that there's a problem in the config.cfg included " - "with the packaged vectors. Make sure that the vectors package you're " - "loading is compatible with the current version of spaCy." - ) - with show_validation_error( - title=title, desc=desc, hint_fill=False, show_config=False - ): - util.load_vectors_into_model(nlp, vectors) - msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}") - - -def verify_config(nlp: Language) -> None: - """Perform additional checks based on the config, loaded nlp object and training data.""" - # TODO: maybe we should validate based on the actual components, the list - # in config["nlp"]["pipeline"] instead? 
- for pipe_config in nlp.config["components"].values(): - # We can't assume that the component name == the factory - factory = pipe_config["factory"] - if factory == "textcat": - verify_textcat_config(nlp, pipe_config) - - -def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: - # if 'positive_label' is provided: double check whether it's in the data and - # the task is binary - if pipe_config.get("positive_label"): - textcat_labels = nlp.get_pipe("textcat").labels - pos_label = pipe_config.get("positive_label") - if pos_label not in textcat_labels: - raise ValueError( - Errors.E920.format(pos_label=pos_label, labels=textcat_labels) - ) - if len(list(textcat_labels)) != 2: - raise ValueError( - Errors.E919.format(pos_label=pos_label, labels=textcat_labels) - ) - - -def create_before_to_disk_callback( - callback: Optional[Callable[[Language], Language]] -) -> Callable[[Language], Language]: - def before_to_disk(nlp: Language) -> Language: - if not callback: - return nlp - modified_nlp = callback(nlp) - if not isinstance(modified_nlp, Language): - err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) - raise ValueError(err) - return modified_nlp - - return before_to_disk diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 29e220b95..6494486a9 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,25 +1,13 @@ from typing import Optional -import numpy -import time -import re -from collections import Counter from pathlib import Path -from thinc.api import require_gpu, set_gpu_allocator -from thinc.api import set_dropout_rate, to_categorical, fix_random_seed -from thinc.api import Config, CosineDistance, L2Distance from wasabi import msg -import srsly -from functools import partial import typer +import re from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code -from ..ml.models.multi_task import build_cloze_multi_task_model -from ..ml.models.multi_task import build_cloze_characters_multi_task_model -from ..tokens import Doc -from ..attrs import ID -from .. 
import util -from ..util import dot_to_object +from ._util import import_code, setup_gpu, CliLogger +from ..training.pretrain import pretrain +from ..util import load_config @app.command( @@ -61,15 +49,11 @@ def pretrain_cli( config_overrides = parse_config_overrides(ctx.args) import_code(code_path) verify_cli_args(config_path, output_dir, resume_path, epoch_resume) - if use_gpu >= 0: - msg.info("Using GPU") - require_gpu(use_gpu) - else: - msg.info("Using CPU") + setup_gpu(use_gpu) msg.info(f"Loading config from: {config_path}") with show_validation_error(config_path): - raw_config = util.load_config( + raw_config = load_config( config_path, overrides=config_overrides, interpolate=False ) config = raw_config.interpolate() @@ -89,250 +73,11 @@ def pretrain_cli( resume_path=resume_path, epoch_resume=epoch_resume, use_gpu=use_gpu, + logger=CliLogger, ) - - -def pretrain( - config: Config, - output_dir: Path, - resume_path: Optional[Path] = None, - epoch_resume: Optional[int] = None, - use_gpu: int = -1, -): - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - nlp = util.load_model_from_config(config) - C = util.resolve_training_config(nlp.config) - P_cfg = C["pretraining"] - corpus = dot_to_object(C, P_cfg["corpus"]) - batcher = P_cfg["batcher"] - model = create_pretraining_model(nlp, C["pretraining"]) - optimizer = C["pretraining"]["optimizer"] - # Load in pretrained weights to resume from - if resume_path is not None: - _resume_model(model, resume_path, epoch_resume) - else: - # Without '--resume-path' the '--epoch-resume' argument is ignored - epoch_resume = 0 - - tracker = ProgressTracker(frequency=10000) - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") - row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} - msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) - - def _save_model(epoch, is_temp=False): - is_temp_str = ".temp" if is_temp else "" - with model.use_params(optimizer.averages): - with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: - file_.write(model.get_ref("tok2vec").to_bytes()) - log = { - "nr_word": tracker.nr_word, - "loss": tracker.loss, - "epoch_loss": tracker.epoch_loss, - "epoch": epoch, - } - with (output_dir / "log.jsonl").open("a") as file_: - file_.write(srsly.json_dumps(log) + "\n") - - objective = create_objective(P_cfg["objective"]) - # TODO: I think we probably want this to look more like the - # 'create_train_batches' function? 
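As a rough usage sketch, the relocated pretraining entry point can also be driven from Python; the spacy.training.pretrain module is added elsewhere in this PR, so the keyword arguments below are assumed from the call site in pretrain_cli above, and the file names are placeholders.

from pathlib import Path
from spacy.training.pretrain import pretrain
from spacy.util import load_config

# Load and interpolate the config the same way pretrain_cli does.
raw_config = load_config("pretrain.cfg", overrides={}, interpolate=False)
config = raw_config.interpolate()

pretrain(
    config,
    Path("pretrain-output"),  # directory for the modelN.bin weights and log.jsonl
    resume_path=None,         # or a path to an existing modelN.bin to resume from
    epoch_resume=None,        # only used together with resume_path
    use_gpu=-1,               # set up the GPU (setup_gpu/require_gpu) before calling
)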
- for epoch in range(epoch_resume, P_cfg["max_epochs"]): - for batch_id, batch in enumerate(batcher(corpus(nlp))): - docs = ensure_docs(batch) - loss = make_update(model, docs, optimizer, objective) - progress = tracker.update(epoch, loss, docs) - if progress: - msg.row(progress, **row_settings) - if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0): - _save_model(epoch, is_temp=True) - _save_model(epoch) - tracker.epoch_loss = 0.0 msg.good("Successfully finished pretrain") -def ensure_docs(examples_or_docs): - docs = [] - for eg_or_doc in examples_or_docs: - if isinstance(eg_or_doc, Doc): - docs.append(eg_or_doc) - else: - docs.append(eg_or_doc.reference) - return docs - - -def _resume_model(model, resume_path, epoch_resume): - msg.info(f"Resume training tok2vec from: {resume_path}") - with resume_path.open("rb") as file_: - weights_data = file_.read() - model.get_ref("tok2vec").from_bytes(weights_data) - # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(resume_path)) - if model_name: - # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 - msg.info(f"Resuming from epoch: {epoch_resume}") - else: - msg.info(f"Resuming from epoch: {epoch_resume}") - - -def make_update(model, docs, optimizer, objective_func): - """Perform an update over a single batch of documents. - - docs (iterable): A batch of `Doc` objects. - optimizer (callable): An optimizer. - RETURNS loss: A float for the loss. - """ - predictions, backprop = model.begin_update(docs) - loss, gradients = objective_func(model.ops, docs, predictions) - backprop(gradients) - model.finish_update(optimizer) - # Don't want to return a cupy object here - # The gradients are modified in-place by the BERT MLM, - # so we get an accurate loss - return float(loss) - - -def create_objective(config): - """Create the objective for pretraining. - - We'd like to replace this with a registry function but it's tricky because - we're also making a model choice based on this. For now we hard-code support - for two types (characters, vectors). For characters you can specify - n_characters, for vectors you can specify the loss. - - Bleh. - """ - objective_type = config["type"] - if objective_type == "characters": - return partial(get_characters_loss, nr_char=config["n_characters"]) - elif objective_type == "vectors": - if config["loss"] == "cosine": - return partial( - get_vectors_loss, - distance=CosineDistance(normalize=True, ignore_zeros=True), - ) - elif config["loss"] == "L2": - return partial( - get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True) - ) - else: - raise ValueError("Unexpected loss type", config["loss"]) - else: - raise ValueError("Unexpected objective_type", objective_type) - - -def get_vectors_loss(ops, docs, prediction, distance): - """Compute a loss based on a distance between the documents' vectors and - the prediction. - """ - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. 
- ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = docs[0].vocab.vectors.data[ids] - d_target, loss = distance(prediction, target) - return loss, d_target - - -def get_characters_loss(ops, docs, prediction, nr_char): - """Compute a loss based on a number of characters predicted from the docs.""" - target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) - target_ids = target_ids.reshape((-1,)) - target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") - target = target.reshape((-1, 256 * nr_char)) - diff = prediction - target - loss = (diff ** 2).sum() - d_target = diff / float(prediction.shape[0]) - return loss, d_target - - -def create_pretraining_model(nlp, pretrain_config): - """Define a network for the pretraining. We simply add an output layer onto - the tok2vec input model. The tok2vec input model needs to be a model that - takes a batch of Doc objects (as a list), and returns a list of arrays. - Each array in the output needs to have one row per token in the doc. - The actual tok2vec layer is stored as a reference, and only this bit will be - serialized to file and read back in when calling the 'train' command. - """ - component = nlp.get_pipe(pretrain_config["component"]) - if pretrain_config.get("layer"): - tok2vec = component.model.get_ref(pretrain_config["layer"]) - else: - tok2vec = component.model - - # TODO - maxout_pieces = 3 - hidden_size = 300 - if pretrain_config["objective"]["type"] == "vectors": - model = build_cloze_multi_task_model( - nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces - ) - elif pretrain_config["objective"]["type"] == "characters": - model = build_cloze_characters_multi_task_model( - nlp.vocab, - tok2vec, - hidden_size=hidden_size, - maxout_pieces=maxout_pieces, - nr_char=pretrain_config["objective"]["n_characters"], - ) - model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) - set_dropout_rate(model, pretrain_config["dropout"]) - return model - - -class ProgressTracker: - def __init__(self, frequency=1000000): - self.loss = 0.0 - self.prev_loss = 0.0 - self.nr_word = 0 - self.words_per_epoch = Counter() - self.frequency = frequency - self.last_time = time.time() - self.last_update = 0 - self.epoch_loss = 0.0 - - def update(self, epoch, loss, docs): - self.loss += loss - self.epoch_loss += loss - words_in_batch = sum(len(doc) for doc in docs) - self.words_per_epoch[epoch] += words_in_batch - self.nr_word += words_in_batch - words_since_update = self.nr_word - self.last_update - if words_since_update >= self.frequency: - wps = words_since_update / (time.time() - self.last_time) - self.last_update = self.nr_word - self.last_time = time.time() - loss_per_word = self.loss - self.prev_loss - status = ( - epoch, - self.nr_word, - _smart_round(self.loss, width=10), - _smart_round(loss_per_word, width=6), - int(wps), - ) - self.prev_loss = float(self.loss) - return status - else: - return None - - -def _smart_round(figure, width=10, max_decimal=4): - """Round large numbers as integers, smaller numbers as decimals.""" - n_digits = len(str(int(figure))) - n_decimal = width - (n_digits + 1) - if n_decimal <= 1: - return str(int(figure)) - else: - n_decimal = min(n_decimal, max_decimal) - format_str = "%." 
+ str(n_decimal) + "f" - return format_str % figure - - def verify_cli_args(config_path, output_dir, resume_path, epoch_resume): if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index afaf230d1..aa0e71b5a 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,24 +1,16 @@ -from typing import Optional, Dict, Any, Tuple, Union, Callable, List -from timeit import default_timer as timer -import tqdm +from typing import Optional from pathlib import Path from wasabi import msg -import thinc -import thinc.schedules -from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator -import random +from thinc.api import Config import typer import logging -from .init_pipeline import init_pipeline -from .init_pipeline import create_before_to_disk_callback from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code +from ._util import import_code, CliLogger, setup_gpu from ..language import Language +from ..training.loop import train +from ..training.initialize import init_nlp, must_reinitialize from .. import util -from ..errors import Errors -from ..util import resolve_dot_names, registry -from ..schemas import ConfigSchemaTraining @app.command( @@ -52,31 +44,33 @@ def train_cli( verify_cli_args(config_path, output_path) overrides = parse_config_overrides(ctx.args) import_code(code_path) - if use_gpu >= 0: - msg.info(f"Using GPU: {use_gpu}") - require_gpu(use_gpu) - else: - msg.info("Using CPU") - config = util.load_config(config_path, overrides=overrides, interpolate=False) + setup_gpu(use_gpu) + with show_validation_error(config_path): + config = util.load_config(config_path, overrides=overrides, interpolate=False) msg.divider("Initializing pipeline") - nlp = init_nlp(config, output_path) + nlp = init_pipeline(config, output_path, use_gpu=use_gpu) msg.divider("Training pipeline") - train(nlp, output_path, use_gpu=use_gpu) + final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger) + if final_path: + msg.good(f"Saved pipeline to output directory", final_path) -def init_nlp(config: Config, output_path: Optional[Path]) -> Language: +def init_pipeline( + config: Config, output_path: Optional[Path], *, use_gpu: int = -1 +) -> Language: + init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good} if output_path is not None: init_path = output_path / "model-initial" if not init_path.exists(): msg.info(f"Initializing the pipeline in {init_path}") - nlp = init_pipeline(config) + nlp = init_nlp(config, **init_kwargs) nlp.to_disk(init_path) msg.good(f"Saved initialized pipeline to {init_path}") else: nlp = util.load_model(init_path) if must_reinitialize(config, nlp.config): msg.warn("Config has changed: need to re-initialize pipeline") - nlp = init_pipeline(config) + nlp = init_nlp(config, **init_kwargs) nlp.to_disk(init_path) msg.good(f"Re-initialized pipeline in {init_path}") else: @@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language: "the vocabulary, vectors and label scheme. To take advantage of this, " "provide an output directory." ) - return init_pipeline(config) - - -def train( - nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1 -) -> None: - # Create iterator, which yields out info after each optimization step. 
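For reference, a small sketch of what the init_pipeline() wrapper above boils down to when used programmatically, assuming the new spacy.training.initialize module introduced by this PR; paths are illustrative.

from pathlib import Path
from spacy.training.initialize import init_nlp
from spacy.util import load_config

config = load_config("config.cfg", interpolate=False)
# Fills the config, sets up the vocab/vectors and calls begin_training on the
# pipeline components, mirroring what `spacy train` now does up front.
nlp = init_nlp(config, use_gpu=-1)
init_path = Path("training/model-initial")
nlp.to_disk(init_path)  # cached so later training runs can reload it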
- config = nlp.config.interpolate() - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) - dot_names = [T["train_corpus"], T["dev_corpus"]] - train_corpus, dev_corpus = resolve_dot_names(config, dot_names) - optimizer = T["optimizer"] - score_weights = T["score_weights"] - batcher = T["batcher"] - train_logger = T["logger"] - before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) - # Components that shouldn't be updated during training - frozen_components = T["frozen_components"] - # Create iterator, which yields out info after each optimization step. - training_step_iterator = train_while_improving( - nlp, - optimizer, - create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]), - create_evaluation_callback(nlp, dev_corpus, score_weights), - dropout=T["dropout"], - accumulate_gradient=T["accumulate_gradient"], - patience=T["patience"], - max_steps=T["max_steps"], - eval_frequency=T["eval_frequency"], - exclude=frozen_components, - ) - msg.info(f"Pipeline: {nlp.pipe_names}") - if frozen_components: - msg.info(f"Frozen components: {frozen_components}") - msg.info(f"Initial learn rate: {optimizer.learn_rate}") - with nlp.select_pipes(disable=frozen_components): - print_row, finalize_logger = train_logger(nlp) - - try: - progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) - progress.set_description(f"Epoch 1") - for batch, info, is_best_checkpoint in training_step_iterator: - progress.update(1) - if is_best_checkpoint is not None: - progress.close() - print_row(info) - if is_best_checkpoint and output_path is not None: - with nlp.select_pipes(disable=frozen_components): - update_meta(T, nlp, info) - with nlp.use_params(optimizer.averages): - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-best") - progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) - progress.set_description(f"Epoch {info['epoch']}") - except Exception as e: - finalize_logger() - if output_path is not None: - # We don't want to swallow the traceback if we don't have a - # specific error. - msg.warn( - f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}" - ) - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-final") - raise e - finally: - finalize_logger() - if output_path is not None: - final_model_path = output_path / "model-final" - if optimizer.averages: - with nlp.use_params(optimizer.averages): - nlp.to_disk(final_model_path) - else: - nlp.to_disk(final_model_path) - msg.good(f"Saved pipeline to output directory {final_model_path}") - - -def must_reinitialize(train_config: Config, init_config: Config) -> bool: - # TODO: do this better and more fine-grained - return train_config.interpolate().to_str() == init_config.interpolate().to_str() - - -def add_vectors(nlp: Language, vectors: str) -> None: - title = f"Config validation error for vectors {vectors}" - desc = ( - "This typically means that there's a problem in the config.cfg included " - "with the packaged vectors. Make sure that the vectors package you're " - "loading is compatible with the current version of spaCy." 
- ) - with show_validation_error( - title=title, desc=desc, hint_fill=False, show_config=False - ): - util.load_vectors_into_model(nlp, vectors) - - -def create_train_batches(iterator, batcher, max_epochs: int): - epoch = 0 - examples = list(iterator) - if not examples: - # Raise error if no data - raise ValueError(Errors.E986) - while max_epochs < 1 or epoch != max_epochs: - random.shuffle(examples) - for batch in batcher(examples): - yield epoch, batch - epoch += 1 - - -def create_evaluation_callback( - nlp: Language, dev_corpus: Callable, weights: Dict[str, float] -) -> Callable[[], Tuple[float, Dict[str, float]]]: - weights = {key: value for key, value in weights.items() if value is not None} - - def evaluate() -> Tuple[float, Dict[str, float]]: - dev_examples = list(dev_corpus(nlp)) - scores = nlp.evaluate(dev_examples) - # Calculate a weighted sum based on score_weights for the main score. - # We can only consider scores that are ints/floats, not dicts like - # entity scores per type etc. - for key, value in scores.items(): - if key in weights and not isinstance(value, (int, float)): - raise ValueError(Errors.E915.format(name=key, score_type=type(value))) - try: - weighted_score = sum( - scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights - ) - except KeyError as e: - keys = list(scores.keys()) - err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) - raise KeyError(err) from None - return weighted_score, scores - - return evaluate - - -def train_while_improving( - nlp: Language, - optimizer: Optimizer, - train_data, - evaluate, - *, - dropout: float, - eval_frequency: int, - accumulate_gradient: int, - patience: int, - max_steps: int, - exclude: List[str], -): - """Train until an evaluation stops improving. Works as a generator, - with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, - where info is a dict, and is_best_checkpoint is in [True, False, None] -- - None indicating that the iteration was not evaluated as a checkpoint. - The evaluation is conducted by calling the evaluate callback. - - Positional arguments: - nlp: The spaCy pipeline to evaluate. - optimizer: The optimizer callable. - train_data (Iterable[Batch]): A generator of batches, with the training - data. Each batch should be a Sized[Tuple[Input, Annot]]. The training - data iterable needs to take care of iterating over the epochs and - shuffling. - evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. - The callback should take no arguments and return a tuple - `(main_score, other_scores)`. The main_score should be a float where - higher is better. other_scores can be any object. - - Every iteration, the function yields out a tuple with: - - * batch: A list of Example objects. - * info: A dict with various information about the last update (see below). - * is_best_checkpoint: A value in None, False, True, indicating whether this - was the best evaluation so far. You should use this to save the model - checkpoints during training. If None, evaluation was not conducted on - that iteration. False means evaluation was conducted, but a previous - evaluation was better. - - The info dict provides the following information: - - epoch (int): How many passes over the data have been completed. - step (int): How many steps have been completed. - score (float): The main score from the last evaluation. - other_scores: : The other scores from the last evaluation. - losses: The accumulated losses throughout training. 
- checkpoints: A list of previous results, where each result is a - (score, step, epoch) tuple. - """ - if isinstance(dropout, float): - dropouts = thinc.schedules.constant(dropout) - else: - dropouts = dropout - results = [] - losses = {} - words_seen = 0 - start_time = timer() - for step, (epoch, batch) in enumerate(train_data): - dropout = next(dropouts) - for subbatch in subdivide_batch(batch, accumulate_gradient): - nlp.update( - subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude - ) - # TODO: refactor this so we don't have to run it separately in here - for name, proc in nlp.pipeline: - if ( - name not in exclude - and hasattr(proc, "model") - and proc.model not in (True, False, None) - ): - proc.model.finish_update(optimizer) - optimizer.step_schedules() - if not (step % eval_frequency): - if optimizer.averages: - with nlp.use_params(optimizer.averages): - score, other_scores = evaluate() - else: - score, other_scores = evaluate() - results.append((score, step)) - is_best_checkpoint = score == max(results)[0] - else: - score, other_scores = (None, None) - is_best_checkpoint = None - words_seen += sum(len(eg) for eg in batch) - info = { - "epoch": epoch, - "step": step, - "score": score, - "other_scores": other_scores, - "losses": losses, - "checkpoints": results, - "seconds": int(timer() - start_time), - "words": words_seen, - } - yield batch, info, is_best_checkpoint - if is_best_checkpoint is not None: - losses = {} - # Stop if no improvement in `patience` updates (if specified) - best_score, best_step = max(results) - if patience and (step - best_step) >= patience: - break - # Stop if we've exhausted our max steps (if specified) - if max_steps and step >= max_steps: - break - - -def subdivide_batch(batch, accumulate_gradient): - batch = list(batch) - batch.sort(key=lambda eg: len(eg.predicted)) - sub_len = len(batch) // accumulate_gradient - start = 0 - for i in range(accumulate_gradient): - subbatch = batch[start : start + sub_len] - if subbatch: - yield subbatch - start += len(subbatch) - subbatch = batch[start:] - if subbatch: - yield subbatch - - -def update_meta( - training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] -) -> None: - nlp.meta["performance"] = {} - for metric in training["score_weights"]: - if metric is not None: - nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) - for pipe_name in nlp.pipe_names: - nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + return init_nlp(config, **init_kwargs) def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None: @@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No if not output_path.exists(): output_path.mkdir() msg.good(f"Created output directory: {output_path}") - - -# TODO: this is currently imported by the ray extension and not used otherwise -def load_from_paths( - config: Config, -) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: - weights_data = None - init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) - if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - return None, {}, {}, weights_data diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 232b53e1d..02e189834 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py 
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.training import Example +from spacy.training.initialize import verify_textcat_config from ..util import make_tempdir -from ...cli.train import verify_textcat_config -from ...training import Example TRAIN_DATA = [ diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index caf4ea890..ee103208c 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR -from spacy.cli.debug_config import check_section_refs from thinc.api import ConfigValidationError, Config import srsly import os @@ -414,15 +413,3 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] - - -def test_check_section_refs(): - config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}} - config = Config(config) - # Valid section reference - check_section_refs(config, ["a.b.c"]) - # Section that doesn't exist in this config - check_section_refs(config, ["x.y.z"]) - # Invalid section reference - with pytest.raises(ConfigValidationError): - check_section_refs(config, ["a.b.c", "f.g"]) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4e079d29e..e6ef45f90 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -7,7 +7,6 @@ from spacy import util from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding -from thinc.api import Optimizer @pytest.fixture @@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected): result = util.dot_to_dict(dot_notation) assert result == expected assert util.dict_to_dot(result) == dot_notation - - -def test_resolve_training_config(): - config = { - "nlp": {"lang": "en", "disabled": []}, - "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}}, - "corpora": {}, - } - resolved = util.resolve_training_config(config) - assert resolved["training"]["dropout"] == 0.1 - assert isinstance(resolved["training"]["optimizer"], Optimizer) - assert resolved["corpora"] == {} - assert "nlp" not in resolved diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 0647b8556..f48cfba00 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,14 +1,15 @@ import pytest -from .util import get_random_doc - from spacy import util from spacy.util import dot_to_object, SimpleFrozenList -from thinc.api import Config, Optimizer +from thinc.api import Config, Optimizer, ConfigValidationError from spacy.training.batchers import minibatch_by_words -from ..lang.en import English -from ..lang.nl import Dutch -from ..language import DEFAULT_CONFIG_PATH +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import DEFAULT_CONFIG_PATH +from spacy.schemas import ConfigSchemaTraining + +from .util import get_random_doc @pytest.mark.parametrize( @@ -101,8 +102,8 @@ def test_util_dot_section(): dot_to_object(en_nlp.config, "nlp.pipeline.tagger") with 
pytest.raises(KeyError): dot_to_object(en_nlp.config, "nlp.unknownattribute") - resolved = util.resolve_training_config(nl_nlp.config) - assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer) + T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining) + assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer) def test_simple_frozen_list(): @@ -120,3 +121,17 @@ def test_simple_frozen_list(): t = SimpleFrozenList(["foo", "bar"], error="Error!") with pytest.raises(NotImplementedError): t.append("baz") + + +def test_resolve_dot_names(): + config = { + "training": {"optimizer": {"@optimizers": "Adam.v1"}}, + "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, + } + result = util.resolve_dot_names(config, ["foo.bar"]) + assert isinstance(result[0], Optimizer) + with pytest.raises(ConfigValidationError) as e: + util.resolve_dot_names(config, ["foo.baz", "foo.bar"]) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ["training", "xyz"] diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index c06c9d282..7d94d5ddc 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable import pytest from thinc.api import Config from spacy import Language -from spacy.util import load_model_from_config, registry, dot_to_object -from spacy.util import resolve_training_config +from spacy.util import load_model_from_config, registry, resolve_dot_names +from spacy.schemas import ConfigSchemaTraining from spacy.training import Example @@ -39,21 +39,21 @@ def test_readers(): config = Config().from_str(config_string) nlp = load_model_from_config(config, auto_fill=True) - resolved = resolve_training_config(nlp.config) - train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) + dot_names = ["training.train_corpus", "training.dev_corpus"] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) assert isinstance(train_corpus, Callable) - optimizer = resolved["training"]["optimizer"] + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + optimizer = T["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) - dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) scores = nlp.evaluate(list(dev_corpus(nlp))) assert scores["cats_score"] # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats - extra_corpus = resolved["corpora"]["extra"] + extra_corpus = registry.resolve(nlp.config["corpora"])["extra"] assert isinstance(extra_corpus, Callable) @@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config): config["corpora"]["@readers"] = reader config["corpora"].update(additional_config) nlp = load_model_from_config(config, auto_fill=True) - resolved = resolve_training_config(nlp.config) - train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) - optimizer = resolved["training"]["optimizer"] + dot_names = ["training.train_corpus", "training.dev_corpus"] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + optimizer = T["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): @@ -100,7 +101,6 @@ def 
test_cat_readers(reader, additional_config): assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] nlp.update([example], sgd=optimizer) # simulate performance benchmark on dev corpus - dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) dev_examples = list(dev_corpus(nlp)) for example in dev_examples: # this shouldn't fail if each dev example has at least one positive label diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index e69de29bb..8938886fe 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -0,0 +1,205 @@ +from typing import Union, Dict, Optional, Any, List, Callable +from thinc.api import Config, fix_random_seed, set_gpu_allocator +from thinc.api import ConfigValidationError +from pathlib import Path +import srsly + +from .loop import create_before_to_disk_callback +from ..language import Language +from ..lookups import Lookups +from ..errors import Errors +from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain +from ..util import registry, load_model_from_config, resolve_dot_names +from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB + + +def init_nlp( + config: Config, + *, + use_gpu: int = -1, + logger: Callable[[Any], Any] = logger, + on_success: Callable[[str], None] = lambda x: None, +) -> Language: + raw_config = config + config = raw_config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + # Use original config here before it's resolved to functions + sourced_components = get_sourced_components(config) + nlp = load_model_from_config(raw_config, auto_fill=True) + on_success("Set up nlp object from config") + config = nlp.config.interpolate() + # Resolve all training-relevant sections using the filled nlp config + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + V = I["vocab"] + init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"]) + optimizer = T["optimizer"] + before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Sourced components that require resume_training + resume_components = [p for p in sourced_components if p not in frozen_components] + logger.info(f"Pipeline: {nlp.pipe_names}") + if resume_components: + with nlp.select_pipes(enable=resume_components): + logger.info(f"Resuming training for: {resume_components}") + nlp.resume_training(sgd=optimizer) + with nlp.select_pipes(disable=[*frozen_components, *resume_components]): + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + on_success(f"Initialized pipeline components") + # Verify the config after calling 'begin_training' to ensure labels + # are properly initialized + verify_config(nlp) + if "pretraining" in config and config["pretraining"]: + P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain) + loaded = add_tok2vec_weights(nlp, P, I) + if loaded and P["component"]: + on_success(f"Loaded pretrained weights into component '{P['component']}'") + nlp = before_to_disk(nlp) + return nlp + + +def must_reinitialize(train_config: Config, 
init_config: Config) -> bool: + # TODO: do this better and more fine-grained + return train_config.interpolate().to_str() == init_config.interpolate().to_str() + + +def init_vocab( + nlp: Language, + *, + data: Optional[Path] = None, + lookups: Optional[Lookups] = None, + vectors: Optional[str] = None, + on_success: Callable[[str], None] = lambda x: None, +) -> Language: + if lookups: + nlp.vocab.lookups = lookups + on_success(f"Added vocab lookups: {', '.join(lookups.tables)}") + data_path = ensure_path(data) + if data_path is not None: + lex_attrs = srsly.read_jsonl(data_path) + for lexeme in nlp.vocab: + lexeme.rank = OOV_RANK + for attrs in lex_attrs: + if "settings" in attrs: + continue + lexeme = nlp.vocab[attrs["orth"]] + lexeme.set_attrs(**attrs) + if len(nlp.vocab): + oov_prob = min(lex.prob for lex in nlp.vocab) - 1 + else: + oov_prob = DEFAULT_OOV_PROB + nlp.vocab.cfg.update({"oov_prob": oov_prob}) + on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab") + on_success("Created vocabulary") + if vectors is not None: + load_vectors_into_model(nlp, vectors) + on_success(f"Added vectors: {vectors}") + + +def load_vectors_into_model( + nlp: "Language", name: Union[str, Path], *, add_strings: bool = True +) -> None: + """Load word vectors from an installed model or path into a model instance.""" + try: + vectors_nlp = load_model(name) + except ConfigValidationError as e: + title = f"Config validation error for vectors {name}" + desc = ( + "This typically means that there's a problem in the config.cfg included " + "with the packaged vectors. Make sure that the vectors package you're " + "loading is compatible with the current version of spaCy." + ) + err = ConfigValidationError.from_error(config=None, title=title, desc=desc) + raise err from None + nlp.vocab.vectors = vectors_nlp.vocab.vectors + if add_strings: + # I guess we should add the strings from the vectors_nlp model? + # E.g. if someone does a similarity query, they might expect the strings. + for key in nlp.vocab.vectors.key2row: + if key in vectors_nlp.vocab.strings: + nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) + + +def add_tok2vec_weights( + nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] +) -> bool: + # Load pretrained tok2vec weights - cf. CLI command 'pretrain' + P = pretrain_config + V = vocab_config + weights_data = None + init_tok2vec = ensure_path(V["init_tok2vec"]) + if init_tok2vec is not None: + if P["objective"].get("type") == "vectors" and not V["vectors"]: + err = 'need initialize.vectors if pretraining.objective.type is "vectors"' + errors = [{"loc": ["initialize", "vectors"], "msg": err}] + raise ConfigValidationError(config=nlp.config, errors=errors) + if not init_tok2vec.exists(): + err = f"can't find pretrained tok2vec: {init_tok2vec}" + errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}] + raise ConfigValidationError(config=nlp.config, errors=errors) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() + if weights_data is not None: + tok2vec_component = P["component"] + if tok2vec_component is None: + desc = ( + f"To use pretrained tok2vec weights, [pretraining.component] " + f"needs to specify the component that should load them." 
+ ) + err = "component can't be null" + errors = [{"loc": ["pretraining", "component"], "msg": err}] + raise ConfigValidationError( + config=nlp.config["pretraining"], errors=errors, desc=desc + ) + layer = nlp.get_pipe(tok2vec_component).model + if P["layer"]: + layer = layer.get_ref(P["layer"]) + layer.from_bytes(weights_data) + return True + return False + + +def verify_config(nlp: Language) -> None: + """Perform additional checks based on the config, loaded nlp object and training data.""" + # TODO: maybe we should validate based on the actual components, the list + # in config["nlp"]["pipeline"] instead? + for pipe_config in nlp.config["components"].values(): + # We can't assume that the component name == the factory + factory = pipe_config["factory"] + if factory == "textcat": + verify_textcat_config(nlp, pipe_config) + + +def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: + # if 'positive_label' is provided: double check whether it's in the data and + # the task is binary + if pipe_config.get("positive_label"): + textcat_labels = nlp.get_pipe("textcat").labels + pos_label = pipe_config.get("positive_label") + if pos_label not in textcat_labels: + raise ValueError( + Errors.E920.format(pos_label=pos_label, labels=textcat_labels) + ) + if len(list(textcat_labels)) != 2: + raise ValueError( + Errors.E919.format(pos_label=pos_label, labels=textcat_labels) + ) + + +def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: + """RETURNS (List[str]): All sourced components in the original config, + e.g. {"source": "en_core_web_sm"}. If the config contains a key + "factory", we assume it refers to a component factory. + """ + return [ + name + for name, cfg in config.get("components", {}).items() + if "factory" not in cfg and "source" in cfg + ] diff --git a/spacy/training/loop.py b/spacy/training/loop.py new file mode 100644 index 000000000..3e3e9f5ce --- /dev/null +++ b/spacy/training/loop.py @@ -0,0 +1,301 @@ +from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any +from typing import Optional +from pathlib import Path +from timeit import default_timer as timer +from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator +import random +import tqdm + +from .example import Example +from ..schemas import ConfigSchemaTraining +from ..language import Language +from ..errors import Errors +from ..util import resolve_dot_names, registry, logger + + +def train( + nlp: Language, + output_path: Optional[Path] = None, + *, + use_gpu: int = -1, + logger: Callable[[Any], Any] = logger, +) -> Optional[Path]: + """Train a pipeline. + + nlp (Language): The initialized nlp object with the full config. + output_path (Path): Optional output path to save trained model to. + use_gpu (int): Whether to train on GPU. Make sure to call require_gpu + before calling this function. + logger (Callable[[Any], Any]): Optional logger exposing the methods info, + error, debug and warn. Defaults to regular spaCy logger but can be + swapped for CLI logger. + RETURNS (Path / None): The path to the final exported model. + """ + + # Create iterator, which yields out info after each optimization step. 
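A possible way to drive the loop outside the CLI, shown only as a sketch: per the docstring above, any object exposing info/warn/error/debug can serve as the logger, so a stdlib logging.Logger can stand in for CliLogger; the config path is a placeholder.

import logging
from pathlib import Path
from spacy.training.initialize import init_nlp
from spacy.training.loop import train
from spacy.util import load_config

log = logging.getLogger("spacy.training")
nlp = init_nlp(load_config("config.cfg", interpolate=False), use_gpu=-1)
final_path = train(nlp, Path("training"), use_gpu=-1, logger=log)
if final_path is not None:
    log.info("Saved pipeline to %s", final_path)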
+ config = nlp.config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + optimizer = T["optimizer"] + score_weights = T["score_weights"] + batcher = T["batcher"] + train_logger = T["logger"] + before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Create iterator, which yields out info after each optimization step. + training_step_iterator = train_while_improving( + nlp, + optimizer, + create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]), + create_evaluation_callback(nlp, dev_corpus, score_weights), + dropout=T["dropout"], + accumulate_gradient=T["accumulate_gradient"], + patience=T["patience"], + max_steps=T["max_steps"], + eval_frequency=T["eval_frequency"], + exclude=frozen_components, + ) + logger.info(f"Pipeline: {nlp.pipe_names}") + if frozen_components: + logger.info(f"Frozen components: {frozen_components}") + logger.info(f"Initial learn rate: {optimizer.learn_rate}") + with nlp.select_pipes(disable=frozen_components): + print_row, finalize_logger = train_logger(nlp) + try: + progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) + progress.set_description(f"Epoch 1") + for batch, info, is_best_checkpoint in training_step_iterator: + progress.update(1) + if is_best_checkpoint is not None: + progress.close() + print_row(info) + if is_best_checkpoint and output_path is not None: + with nlp.select_pipes(disable=frozen_components): + update_meta(T, nlp, info) + with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) + nlp.to_disk(output_path / "model-best") + progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) + progress.set_description(f"Epoch {info['epoch']}") + except Exception as e: + finalize_logger() + if output_path is not None: + # We don't want to swallow the traceback if we don't have a + # specific error. + logger.warn( + f"Aborting and saving the final best model. " + f"Encountered exception: {str(e)}" + ) + nlp = before_to_disk(nlp) + nlp.to_disk(output_path / "model-final") + raise e + finally: + finalize_logger() + if output_path is not None: + final_model_path = output_path / "model-final" + if optimizer.averages: + with nlp.use_params(optimizer.averages): + nlp.to_disk(final_model_path) + else: + nlp.to_disk(final_model_path) + return final_model_path + + +def train_while_improving( + nlp: Language, + optimizer: Optimizer, + train_data, + evaluate, + *, + dropout: float, + eval_frequency: int, + accumulate_gradient: int, + patience: int, + max_steps: int, + exclude: List[str], +): + """Train until an evaluation stops improving. Works as a generator, + with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, + where info is a dict, and is_best_checkpoint is in [True, False, None] -- + None indicating that the iteration was not evaluated as a checkpoint. + The evaluation is conducted by calling the evaluate callback. + + Positional arguments: + nlp: The spaCy pipeline to evaluate. + optimizer: The optimizer callable. + train_data (Iterable[Batch]): A generator of batches, with the training + data. 
Each batch should be a Sized[Tuple[Input, Annot]]. The training + data iterable needs to take care of iterating over the epochs and + shuffling. + evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. + The callback should take no arguments and return a tuple + `(main_score, other_scores)`. The main_score should be a float where + higher is better. other_scores can be any object. + + Every iteration, the function yields out a tuple with: + + * batch: A list of Example objects. + * info: A dict with various information about the last update (see below). + * is_best_checkpoint: A value in None, False, True, indicating whether this + was the best evaluation so far. You should use this to save the model + checkpoints during training. If None, evaluation was not conducted on + that iteration. False means evaluation was conducted, but a previous + evaluation was better. + + The info dict provides the following information: + + epoch (int): How many passes over the data have been completed. + step (int): How many steps have been completed. + score (float): The main score from the last evaluation. + other_scores: : The other scores from the last evaluation. + losses: The accumulated losses throughout training. + checkpoints: A list of previous results, where each result is a + (score, step, epoch) tuple. + """ + if isinstance(dropout, float): + dropouts = constant(dropout) + else: + dropouts = dropout + results = [] + losses = {} + words_seen = 0 + start_time = timer() + for step, (epoch, batch) in enumerate(train_data): + dropout = next(dropouts) + for subbatch in subdivide_batch(batch, accumulate_gradient): + nlp.update( + subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + ) + # TODO: refactor this so we don't have to run it separately in here + for name, proc in nlp.pipeline: + if ( + name not in exclude + and hasattr(proc, "model") + and proc.model not in (True, False, None) + ): + proc.model.finish_update(optimizer) + optimizer.step_schedules() + if not (step % eval_frequency): + if optimizer.averages: + with nlp.use_params(optimizer.averages): + score, other_scores = evaluate() + else: + score, other_scores = evaluate() + results.append((score, step)) + is_best_checkpoint = score == max(results)[0] + else: + score, other_scores = (None, None) + is_best_checkpoint = None + words_seen += sum(len(eg) for eg in batch) + info = { + "epoch": epoch, + "step": step, + "score": score, + "other_scores": other_scores, + "losses": losses, + "checkpoints": results, + "seconds": int(timer() - start_time), + "words": words_seen, + } + yield batch, info, is_best_checkpoint + if is_best_checkpoint is not None: + losses = {} + # Stop if no improvement in `patience` updates (if specified) + best_score, best_step = max(results) + if patience and (step - best_step) >= patience: + break + # Stop if we've exhausted our max steps (if specified) + if max_steps and step >= max_steps: + break + + +def subdivide_batch(batch, accumulate_gradient): + batch = list(batch) + batch.sort(key=lambda eg: len(eg.predicted)) + sub_len = len(batch) // accumulate_gradient + start = 0 + for i in range(accumulate_gradient): + subbatch = batch[start : start + sub_len] + if subbatch: + yield subbatch + start += len(subbatch) + subbatch = batch[start:] + if subbatch: + yield subbatch + + +def create_evaluation_callback( + nlp: Language, dev_corpus: Callable, weights: Dict[str, float] +) -> Callable[[], Tuple[float, Dict[str, float]]]: + weights = {key: value for key, value in weights.items() if 
value is not None} + + def evaluate() -> Tuple[float, Dict[str, float]]: + dev_examples = list(dev_corpus(nlp)) + scores = nlp.evaluate(dev_examples) + # Calculate a weighted sum based on score_weights for the main score. + # We can only consider scores that are ints/floats, not dicts like + # entity scores per type etc. + for key, value in scores.items(): + if key in weights and not isinstance(value, (int, float)): + raise ValueError(Errors.E915.format(name=key, score_type=type(value))) + try: + weighted_score = sum( + scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights + ) + except KeyError as e: + keys = list(scores.keys()) + err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) + raise KeyError(err) from None + return weighted_score, scores + + return evaluate + + +def create_train_batches( + iterator: Iterator[Example], + batcher: Callable[[Iterable[Example]], Iterable[Example]], + max_epochs: int, +): + epoch = 0 + examples = list(iterator) + if not examples: + # Raise error if no data + raise ValueError(Errors.E986) + while max_epochs < 1 or epoch != max_epochs: + random.shuffle(examples) + for batch in batcher(examples): + yield epoch, batch + epoch += 1 + + +def update_meta( + training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] +) -> None: + nlp.meta["performance"] = {} + for metric in training["score_weights"]: + if metric is not None: + nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) + for pipe_name in nlp.pipe_names: + nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + + +def create_before_to_disk_callback( + callback: Optional[Callable[[Language], Language]] +) -> Callable[[Language], Language]: + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py new file mode 100644 index 000000000..1e0f055ee --- /dev/null +++ b/spacy/training/pretrain.py @@ -0,0 +1,267 @@ +from typing import Optional, Callable, Any, Iterable, Union, List +from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer +from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance +from pathlib import Path +from functools import partial +from collections import Counter +import srsly +import numpy +import time +import re +from wasabi import msg + +from .example import Example +from ..tokens import Doc +from ..attrs import ID +from ..ml.models.multi_task import build_cloze_multi_task_model +from ..ml.models.multi_task import build_cloze_characters_multi_task_model +from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain +from ..util import registry, load_model_from_config, dot_to_object, logger + + +def pretrain( + config: Config, + output_dir: Path, + resume_path: Optional[Path] = None, + epoch_resume: Optional[int] = None, + use_gpu: int = -1, + logger: Callable[[Any], Any] = logger, +): + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + nlp = load_model_from_config(config) + T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining) + P = registry.resolve(nlp.config["pretraining"], 
schema=ConfigSchemaPretrain) + corpus = dot_to_object(T, P["corpus"]) + batcher = P["batcher"] + model = create_pretraining_model(nlp, P) + optimizer = P["optimizer"] + # Load in pretrained weights to resume from + if resume_path is not None: + _resume_model(model, resume_path, epoch_resume) + else: + # Without '--resume-path' the '--epoch-resume' argument is ignored + epoch_resume = 0 + + # TODO: move this to logger function? + tracker = ProgressTracker(frequency=10000) + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") + row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} + msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) + + def _save_model(epoch, is_temp=False): + is_temp_str = ".temp" if is_temp else "" + with model.use_params(optimizer.averages): + with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: + file_.write(model.get_ref("tok2vec").to_bytes()) + log = { + "nr_word": tracker.nr_word, + "loss": tracker.loss, + "epoch_loss": tracker.epoch_loss, + "epoch": epoch, + } + with (output_dir / "log.jsonl").open("a") as file_: + file_.write(srsly.json_dumps(log) + "\n") + + objective = create_objective(P["objective"]) + # TODO: I think we probably want this to look more like the + # 'create_train_batches' function? + for epoch in range(epoch_resume, P["max_epochs"]): + for batch_id, batch in enumerate(batcher(corpus(nlp))): + docs = ensure_docs(batch) + loss = make_update(model, docs, optimizer, objective) + progress = tracker.update(epoch, loss, docs) + if progress: + msg.row(progress, **row_settings) + if P["n_save_every"] and (batch_id % P["n_save_every"] == 0): + _save_model(epoch, is_temp=True) + _save_model(epoch) + tracker.epoch_loss = 0.0 + + +def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]: + docs = [] + for eg_or_doc in examples_or_docs: + if isinstance(eg_or_doc, Doc): + docs.append(eg_or_doc) + else: + docs.append(eg_or_doc.reference) + return docs + + +def _resume_model( + model: Model, + resume_path: Path, + epoch_resume: int, + logger: Callable[[Any], Any] = logger, +) -> None: + logger.info(f"Resume training tok2vec from: {resume_path}") + with resume_path.open("rb") as file_: + weights_data = file_.read() + model.get_ref("tok2vec").from_bytes(weights_data) + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(resume_path)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 + logger.info(f"Resuming from epoch: {epoch_resume}") + else: + logger.info(f"Resuming from epoch: {epoch_resume}") + + +def make_update( + model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable +) -> float: + """Perform an update over a single batch of documents. + + docs (iterable): A batch of `Doc` objects. + optimizer (callable): An optimizer. + RETURNS loss: A float for the loss. + """ + predictions, backprop = model.begin_update(docs) + loss, gradients = objective_func(model.ops, docs, predictions) + backprop(gradients) + model.finish_update(optimizer) + # Don't want to return a cupy object here + # The gradients are modified in-place by the BERT MLM, + # so we get an accurate loss + return float(loss) + + +def create_objective(config: Config): + """Create the objective for pretraining. 
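+
+    The config passed in is the [pretraining.objective] block; illustrative
+    (assumed) values might look like:
+        {"type": "characters", "n_characters": 4}
+        {"type": "vectors", "loss": "cosine"}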
+
+    We'd like to replace this with a registry function but it's tricky because
+    we're also making a model choice based on this. For now we hard-code support
+    for two types (characters, vectors). For characters you can specify
+    n_characters, for vectors you can specify the loss.
+    """
+    objective_type = config["type"]
+    if objective_type == "characters":
+        return partial(get_characters_loss, nr_char=config["n_characters"])
+    elif objective_type == "vectors":
+        if config["loss"] == "cosine":
+            distance = CosineDistance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        elif config["loss"] == "L2":
+            distance = L2Distance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        else:
+            raise ValueError("Unexpected loss type", config["loss"])
+    else:
+        raise ValueError("Unexpected objective_type", objective_type)
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction.
+    """
+    # The simplest way to implement this would be to vstack the
+    # token.vector values, but that's a bit inefficient, especially on GPU.
+    # Instead we fetch the index into the vectors table for each of our tokens,
+    # and look them up all at once. This prevents data copying.
+    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    target = docs[0].vocab.vectors.data[ids]
+    d_target, loss = distance(prediction, target)
+    return loss, d_target
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a loss based on a number of characters predicted from the docs."""
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
+def create_pretraining_model(nlp, pretrain_config):
+    """Define a network for the pretraining. We simply add an output layer onto
+    the tok2vec input model. The tok2vec input model needs to be a model that
+    takes a batch of Doc objects (as a list), and returns a list of arrays.
+    Each array in the output needs to have one row per token in the doc.
+    The actual tok2vec layer is stored as a reference, and only this bit will be
+    serialized to file and read back in when calling the 'train' command.
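+
+    Illustrative usage (a sketch mirroring the pretrain() loop above; `P` is
+    the resolved [pretraining] config block):
+        model = create_pretraining_model(nlp, P)
+        objective = create_objective(P["objective"])
+        loss = make_update(model, docs, optimizer, objective)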
+ """ + component = nlp.get_pipe(pretrain_config["component"]) + if pretrain_config.get("layer"): + tok2vec = component.model.get_ref(pretrain_config["layer"]) + else: + tok2vec = component.model + + # TODO + maxout_pieces = 3 + hidden_size = 300 + if pretrain_config["objective"]["type"] == "vectors": + model = build_cloze_multi_task_model( + nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces + ) + elif pretrain_config["objective"]["type"] == "characters": + model = build_cloze_characters_multi_task_model( + nlp.vocab, + tok2vec, + hidden_size=hidden_size, + maxout_pieces=maxout_pieces, + nr_char=pretrain_config["objective"]["n_characters"], + ) + model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) + set_dropout_rate(model, pretrain_config["dropout"]) + return model + + +class ProgressTracker: + def __init__(self, frequency=1000000): + self.loss = 0.0 + self.prev_loss = 0.0 + self.nr_word = 0 + self.words_per_epoch = Counter() + self.frequency = frequency + self.last_time = time.time() + self.last_update = 0 + self.epoch_loss = 0.0 + + def update(self, epoch, loss, docs): + self.loss += loss + self.epoch_loss += loss + words_in_batch = sum(len(doc) for doc in docs) + self.words_per_epoch[epoch] += words_in_batch + self.nr_word += words_in_batch + words_since_update = self.nr_word - self.last_update + if words_since_update >= self.frequency: + wps = words_since_update / (time.time() - self.last_time) + self.last_update = self.nr_word + self.last_time = time.time() + loss_per_word = self.loss - self.prev_loss + status = ( + epoch, + self.nr_word, + _smart_round(self.loss, width=10), + _smart_round(loss_per_word, width=6), + int(wps), + ) + self.prev_loss = float(self.loss) + return status + else: + return None + + +def _smart_round( + figure: Union[float, int], width: int = 10, max_decimal: int = 4 +) -> str: + """Round large numbers as integers, smaller numbers as decimals.""" + n_digits = len(str(int(figure))) + n_decimal = width - (n_digits + 1) + if n_decimal <= 1: + return str(int(figure)) + else: + n_decimal = min(n_decimal, max_decimal) + format_str = "%." + str(n_decimal) + "f" + return format_str % figure diff --git a/spacy/util.py b/spacy/util.py index cab7af8fb..9d7199d7f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -8,6 +8,7 @@ import re from pathlib import Path import thinc from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer +from thinc.api import ConfigValidationError import functools import itertools import numpy.random @@ -56,6 +57,7 @@ if TYPE_CHECKING: OOV_RANK = numpy.iinfo(numpy.uint64).max +DEFAULT_OOV_PROB = -20 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config.cfg. Not all sections needs to exist, @@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path: return Path(sys.modules[module.__module__].__file__).parent -def load_vectors_into_model( - nlp: "Language", name: Union[str, Path], *, add_strings=True -) -> None: - """Load word vectors from an installed model or path into a model instance.""" - vectors_nlp = load_model(name) - nlp.vocab.vectors = vectors_nlp.vocab.vectors - if add_strings: - # I guess we should add the strings from the vectors_nlp model? - # E.g. if someone does a similarity query, they might expect the strings. 
- for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.vocab.strings: - nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) - - def load_model( name: Union[str, Path], *, @@ -391,32 +379,9 @@ def load_model_from_config( return nlp -def resolve_training_config( - config: Config, - exclude: Iterable[str] = ("nlp", "components"), - validate: bool = True, -) -> Dict[str, Any]: - """Resolve the config sections relevant for trainig and create all objects. - Mostly used in the CLI to separate training config (not resolved by default - because not runtime-relevant – an nlp object should load fine even if it's - [training] block refers to functions that are not available etc.). - - config (Config): The config to resolve. - exclude (Iterable[str]): The config blocks to exclude. Those blocks won't - be available in the final resolved config. - validate (bool): Whether to validate the config. - RETURNS (Dict[str, Any]): The resolved config. - """ - config = config.copy() - for key in exclude: - if key in config: - config.pop(key) - return registry.resolve(config, validate=validate) - - def resolve_dot_names( config: Config, dot_names: List[Optional[str]] -) -> List[Optional[Callable]]: +) -> Tuple[Any]: """Resolve one or more "dot notation" names, e.g. corpora.train. The paths could point anywhere into the config, so we don't know which top-level section we'll be looking within. @@ -424,18 +389,42 @@ def resolve_dot_names( We resolve the whole top-level section, although we could resolve less -- we could find the lowest part of the tree. """ + # TODO: include schema? + # TODO: clean this up and avoid duplication resolved = {} output = [] + errors = [] for name in dot_names: if name is None: output.append(name) else: section = name.split(".")[0] - # We want to avoid resolving the same thing twice. + # We want to avoid resolving the same thing twice if section not in resolved: resolved[section] = registry.resolve(config[section]) - output.append(dot_to_object(resolved, name)) - return output + try: + output.append(dot_to_object(resolved, name)) + except KeyError: + msg = f"not a valid section reference: {name}" + errors.append({"loc": name.split("."), "msg": msg}) + objects = [] + for ref in output: + if not isinstance(ref, str): + msg = f"not a valid section reference: {ref} ({type(ref)})" + errors.append({"loc": ref.split("."), "msg": msg}) + continue + section = ref.split(".")[0] + # We want to avoid resolving the same thing twice + if section not in resolved: + resolved[section] = registry.resolve(config[section]) + try: + objects.append(dot_to_object(resolved, ref)) + except KeyError: + msg = f"not a valid section reference: {name}" + errors.append({"loc": ref.split("."), "msg": msg}) + if errors: + raise ConfigValidationError(config=config, errors=errors) + return tuple(objects) def load_model_from_init_py(