From 553bfea6418e76c28b8786de35df7a3df0e0b56a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 28 Sep 2020 10:53:17 +0200
Subject: [PATCH] Fix commands

---
 spacy/cli/_util.py         |  22 ++++++
 spacy/cli/init_model.py    |  68 ------------------
 spacy/cli/init_pipeline.py | 110 +++++++++++++++++++++++++++++--------
 spacy/cli/train.py         |  84 +++++++++-------------------
 4 files changed, 134 insertions(+), 150 deletions(-)
 delete mode 100644 spacy/cli/init_model.py

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 68cb572ea..6eafee4df 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -16,6 +16,7 @@ import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import ensure_path
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -458,3 +459,24 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
             p = int(p)
         result.append(p)
     return result
+
+
+def load_from_paths(
+    config: Config,
+) -> Tuple[List[Dict[str, str]], Dict[str, dict], Dict[str, dict], Optional[bytes]]:
+    # TODO: separate checks from loading
+    raw_text = ensure_path(config["training"]["raw_text"])
+    if raw_text is not None:
+        if not raw_text.exists():
+            msg.fail("Can't find raw text", raw_text, exits=1)
+        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
+    tag_map = {}
+    morph_rules = {}
+    weights_data = None
+    init_tok2vec = ensure_path(config["training"]["init_tok2vec"])
+    if init_tok2vec is not None:
+        if not init_tok2vec.exists():
+            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    return raw_text, tag_map, morph_rules, weights_data
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
deleted file mode 100644
index 4194f1bd0..000000000
--- a/spacy/cli/init_model.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from typing import Optional, List, Dict, Any, Union, IO
-import math
-from tqdm import tqdm
-import numpy
-from ast import literal_eval
-from pathlib import Path
-from preshed.counter import PreshCounter
-import tarfile
-import gzip
-import zipfile
-import srsly
-import warnings
-from wasabi import msg, Printer
-import typer
-
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-
-DEFAULT_OOV_PROB = -20
-
-
-#@init_cli.command("vocab")
-#@app.command(
-#    "init-model",
-#    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-#    hidden=True,  # hide this from main CLI help but still allow it to work with warning
-#)
-def init_model_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    lang: str = Arg(..., help="Pipeline language"),
-    output_dir: Path = Arg(..., help="Pipeline output directory"),
-    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
-    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
-    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
-    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
-    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
-    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
-    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
-    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
-    # fmt: on
-):
-    """
-    Create a new blank pipeline directory with vocab and vectors from raw data.
-    If vectors are provided in Word2Vec format, they can be either a .txt or
-    zipped as a .zip or .tar.gz.
-
-    DOCS: https://nightly.spacy.io/api/cli#init-vocab
-    """
-    if ctx.command.name == "init-model":
-        msg.warn(
-            "The init-model command is now called 'init vocab'. You can run "
-            "'python -m spacy init --help' for an overview of the other "
-            "available initialization commands."
-        )
-    init_vocab(
-        lang,
-        output_dir,
-        freqs_loc=freqs_loc,
-        clusters_loc=clusters_loc,
-        jsonl_loc=jsonl_loc,
-        vectors_loc=vectors_loc,
-        prune_vectors=prune_vectors,
-        truncate_vectors=truncate_vectors,
-        vectors_name=vectors_name,
-        model_name=model_name,
-        base_model=base_model,
-        silent=False,
-    )
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 34b396a50..ca70b51d1 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -1,18 +1,17 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
+from typing import Optional, Dict, Callable, Any
 import logging
-import srsly
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed
+from thinc.api import Config, fix_random_seed, set_gpu_allocator
 
-from .train import create_before_to_disk_callback
 from .. import util
-from ..util import registry
-from ..schemas import ConfigSchemaTraining
+from ..util import registry, resolve_dot_names
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..language import Language
+from ..errors import Errors
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-from ..util import resolve_dot_names
+from ._util import import_code, get_sourced_components, load_from_paths
 
 
 @init_cli.command(
@@ -31,10 +30,12 @@ def init_pipeline_cli(
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(config_path):
-        nlp = init_pipeline(config)
+        config = util.load_config(config_path, overrides=overrides)
+        nlp = init_pipeline(config)
     nlp.to_disk(output_path)
+    # TODO: add more instructions
+    msg.good(f"Saved initialized pipeline to {output_path}")
 
 
 def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
@@ -51,7 +52,7 @@ def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
     return False
 
 
-def init_pipeline(config: Config, use_gpu=-1):
+def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -61,22 +62,19 @@ def init_pipeline(config: Config, use_gpu=-1):
         set_gpu_allocator(allocator)
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
-    nlp = util.load_model_from_config(raw_config)
+    with show_validation_error():
+        nlp = util.load_model_from_config(raw_config)
+    msg.good("Set up nlp object from config")
     # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(
-        config["training"],
-        schema=ConfigSchemaTraining,
-        validate=True,
-    )
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
     util.load_vocab_data_into_model(nlp, lookups=T["lookups"])
+    msg.good("Created vocabulary")
     if T["vectors"] is not None:
         add_vectors(nlp, T["vectors"])
-    score_weights = T["score_weights"]
+        msg.good(f"Added vectors: {T['vectors']}")
     optimizer = T["optimizer"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
@@ -89,13 +87,23 @@ def init_pipeline(config: Config, use_gpu=-1):
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    msg.good(f"Initialized pipeline components")
     # Verify the config after calling 'begin_training' to ensure labels
     # are properly initialized
     verify_config(nlp)
+    if "pretraining" in config and config["pretraining"]:
+        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
+        add_tok2vec_weights({"training": T, "pretraining": P}, nlp)
+    # TODO: this should be handled better?
+    nlp = before_to_disk(nlp)
+    return nlp
+
+
+def add_tok2vec_weights(config: Config, nlp: Language) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
+    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     if weights_data is not None:
-        tok2vec_component = C["pretraining"]["component"]
+        tok2vec_component = config["pretraining"]["component"]
         if tok2vec_component is None:
             msg.fail(
                 f"To use pretrained tok2vec weights, [pretraining.component] "
@@ -103,9 +111,63 @@ def init_pipeline(config: Config, use_gpu=-1):
                 exits=1,
             )
         layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = C["pretraining"]["layer"]
+        tok2vec_layer = config["pretraining"]["layer"]
         if tok2vec_layer:
             layer = layer.get_ref(tok2vec_layer)
         layer.from_bytes(weights_data)
-        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
-    return nlp
+        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
+
+
+def add_vectors(nlp: Language, vectors: str) -> None:
+    title = f"Config validation error for vectors {vectors}"
+    desc = (
+        "This typically means that there's a problem in the config.cfg included "
+        "with the packaged vectors. Make sure that the vectors package you're "
+        "loading is compatible with the current version of spaCy."
+    )
+    with show_validation_error(
+        title=title, desc=desc, hint_fill=False, show_config=False
+    ):
+        util.load_vectors_into_model(nlp, vectors)
+
+
+def verify_config(nlp: Language) -> None:
+    """Perform additional checks based on the config, loaded nlp object and training data."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for pipe_config in nlp.config["components"].values():
+        # We can't assume that the component name == the factory
+        factory = pipe_config["factory"]
+        if factory == "textcat":
+            verify_textcat_config(nlp, pipe_config)
+
+
+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+    # if 'positive_label' is provided: double check whether it's in the data and
+    # the task is binary
+    if pipe_config.get("positive_label"):
+        textcat_labels = nlp.get_pipe("textcat").labels
+        pos_label = pipe_config.get("positive_label")
+        if pos_label not in textcat_labels:
+            raise ValueError(
+                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
+            )
+        if len(list(textcat_labels)) != 2:
+            raise ValueError(
+                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
+            )
+
+
+def create_before_to_disk_callback(
+    callback: Optional[Callable[[Language], Language]]
+) -> Callable[[Language], Language]:
+    def before_to_disk(nlp: Language) -> Language:
+        if not callback:
+            return nlp
+        modified_nlp = callback(nlp)
+        if not isinstance(modified_nlp, Language):
+            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
+            raise ValueError(err)
+        return modified_nlp
+
+    return before_to_disk
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8a360ad44..3476d5966 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,6 +1,5 @@
 from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 from timeit import default_timer as timer
-import srsly
 import tqdm
 from pathlib import Path
 from wasabi import msg
@@ -11,13 +10,17 @@ import random
 import typer
 import logging
 
+from .init_pipeline import init_pipeline, must_initialize
+from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
+from ._util import import_code
+from ._util import load_from_paths  # noqa: F401 (needed for Ray extension for now)
 from ..language import Language
 from .. import util
 from ..training.example import Example
 from ..errors import Errors
-from ..util import resolve_dot_names
+from ..util import resolve_dot_names, registry
+from ..schemas import ConfigSchemaTraining
 
 
 @app.command(
@@ -56,25 +59,35 @@ def train_cli(
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
-    config = util.load_config(
-        config_path, overrides=config_overrides, interpolate=False
-    )
+    config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    # TODO: add warnings / --initialize (?) argument
    if output_path is None:
         nlp = init_pipeline(config)
     else:
-        init_path = output_path / "model-initial"
-        if must_reinitialize(config, init_path):
+        init_path = output_path / "model-initial"
+        if must_initialize(config, init_path):
             nlp = init_pipeline(config)
             nlp.to_disk(init_path)
+            msg.good(f"Saved initialized pipeline to {init_path}")
         else:
-            nlp = spacy.load(output_path / "model-initial")
-    msg.info("Start training")
-    train(nlp, config, output_path)
+            nlp = util.load_model(init_path)
+            msg.good(f"Loaded initialized pipeline from {init_path}")
+    msg.divider("Training pipeline")
+    train(nlp, output_path, use_gpu=use_gpu)
 
 
-def train(nlp: Language, output_path: Optional[Path]=None) -> None:
+def train(
+    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
+) -> None:
+    # TODO: random seed, GPU allocator
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
@@ -85,9 +98,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
 
-    # Create iterator, which yields out info after each optimization step.
-    msg.info("Start training")
     training_step_iterator = train_while_improving(
         nlp,
         optimizer,
@@ -101,7 +112,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
         raw_text=raw_text,
         exclude=frozen_components,
     )
-    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
+    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
 
     with nlp.select_pipes(disable=frozen_components):
         print_row, finalize_logger = train_logger(nlp)
@@ -145,7 +156,6 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
             msg.good(f"Saved pipeline to output directory {final_model_path}")
-
 
 
 def add_vectors(nlp: Language, vectors: str) -> None:
     title = f"Config validation error for vectors {vectors}"
     desc = (
@@ -199,21 +209,6 @@ def create_evaluation_callback(
     return evaluate
 
 
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
-
-
 def train_while_improving(
     nlp: Language,
     optimizer: Optimizer,
@@ -370,30 +365,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )