mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Fix typos and refactor CLI logging
This commit is contained in:
parent
2e9c9e74af
commit
a139fe672b
|
@ -448,19 +448,8 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
|
|||
return result
|
||||
|
||||
|
||||
class CliLogger:
|
||||
"""Helper mocking up the most commonly used logger methods. Can be passed
|
||||
into functions like train() to make them output pretty-printed messages
|
||||
on the CLI and regular logging if used from within Python.
|
||||
"""
|
||||
|
||||
debug = msg.text
|
||||
info = msg.info
|
||||
warn = msg.info
|
||||
error = msg.fail
|
||||
|
||||
|
||||
def setup_gpu(use_gpu: int):
|
||||
def setup_gpu(use_gpu: int) -> None:
|
||||
"""Configure the GPU and log info."""
|
||||
if use_gpu >= 0:
|
||||
msg.info(f"Using GPU: {use_gpu}")
|
||||
require_gpu(use_gpu)
|
||||
|
|
|
@ -7,7 +7,7 @@ import typer
|
|||
from .. import util
|
||||
from ..training.initialize import init_nlp
|
||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, CliLogger, setup_gpu
|
||||
from ._util import import_code, setup_gpu
|
||||
|
||||
|
||||
@init_cli.command(
|
||||
|
@ -32,6 +32,6 @@ def init_pipeline_cli(
|
|||
with show_validation_error(config_path):
|
||||
config = util.load_config(config_path, overrides=overrides)
|
||||
with show_validation_error(hint_fill=False):
|
||||
nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
|
||||
nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
|
||||
nlp.to_disk(output_path)
|
||||
msg.good(f"Saved initialized pipeline to {output_path}")
|
||||
|
|
|
@ -5,7 +5,7 @@ import typer
|
|||
import re
|
||||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, setup_gpu, CliLogger
|
||||
from ._util import import_code, setup_gpu
|
||||
from ..training.pretrain import pretrain
|
||||
from ..util import load_config
|
||||
|
||||
|
@ -73,7 +73,7 @@ def pretrain_cli(
|
|||
resume_path=resume_path,
|
||||
epoch_resume=epoch_resume,
|
||||
use_gpu=use_gpu,
|
||||
logger=CliLogger,
|
||||
silent=False,
|
||||
)
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import typer
|
|||
import logging
|
||||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, CliLogger, setup_gpu
|
||||
from ._util import import_code, setup_gpu
|
||||
from ..language import Language
|
||||
from ..training.loop import train
|
||||
from ..training.initialize import init_nlp, must_reinitialize
|
||||
|
@ -50,15 +50,13 @@ def train_cli(
|
|||
msg.divider("Initializing pipeline")
|
||||
nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
|
||||
msg.divider("Training pipeline")
|
||||
final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
|
||||
if final_path:
|
||||
msg.good(f"Saved pipeline to output directory", final_path)
|
||||
train(nlp, output_path, use_gpu=use_gpu, silent=False)
|
||||
|
||||
|
||||
def init_pipeline(
|
||||
config: Config, output_path: Optional[Path], *, use_gpu: int = -1
|
||||
) -> Language:
|
||||
init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
|
||||
init_kwargs = {"use_gpu": use_gpu, "silent": False}
|
||||
if output_path is not None:
|
||||
init_path = output_path / "model-initial"
|
||||
if not init_path.exists():
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from typing import Union, Dict, Optional, Any, List, Callable
|
||||
from typing import Union, Dict, Optional, Any, List
|
||||
from thinc.api import Config, fix_random_seed, set_gpu_allocator
|
||||
from thinc.api import ConfigValidationError
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
import srsly
|
||||
|
||||
from .loop import create_before_to_disk_callback
|
||||
|
@ -10,16 +11,11 @@ from ..lookups import Lookups
|
|||
from ..errors import Errors
|
||||
from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
|
||||
from ..util import registry, load_model_from_config, resolve_dot_names
|
||||
from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
|
||||
from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
|
||||
|
||||
|
||||
def init_nlp(
|
||||
config: Config,
|
||||
*,
|
||||
use_gpu: int = -1,
|
||||
logger: Callable[[Any], Any] = logger,
|
||||
on_success: Callable[[str], None] = lambda x: None,
|
||||
) -> Language:
|
||||
def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
|
||||
msg = Printer(no_print=silent)
|
||||
raw_config = config
|
||||
config = raw_config.interpolate()
|
||||
if config["training"]["seed"] is not None:
|
||||
|
@ -30,7 +26,7 @@ def init_nlp(
|
|||
# Use original config here before it's resolved to functions
|
||||
sourced_components = get_sourced_components(config)
|
||||
nlp = load_model_from_config(raw_config, auto_fill=True)
|
||||
on_success("Set up nlp object from config")
|
||||
msg.good("Set up nlp object from config")
|
||||
config = nlp.config.interpolate()
|
||||
# Resolve all training-relevant sections using the filled nlp config
|
||||
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
|
||||
|
@ -38,29 +34,31 @@ def init_nlp(
|
|||
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
|
||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||
V = I["vocab"]
|
||||
init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
|
||||
init_vocab(
|
||||
nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
|
||||
)
|
||||
optimizer = T["optimizer"]
|
||||
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
|
||||
# Components that shouldn't be updated during training
|
||||
frozen_components = T["frozen_components"]
|
||||
# Sourced components that require resume_training
|
||||
resume_components = [p for p in sourced_components if p not in frozen_components]
|
||||
logger.info(f"Pipeline: {nlp.pipe_names}")
|
||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
||||
if resume_components:
|
||||
with nlp.select_pipes(enable=resume_components):
|
||||
logger.info(f"Resuming training for: {resume_components}")
|
||||
msg.info(f"Resuming training for: {resume_components}")
|
||||
nlp.resume_training(sgd=optimizer)
|
||||
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
|
||||
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
||||
on_success(f"Initialized pipeline components")
|
||||
msg.good(f"Initialized pipeline components")
|
||||
# Verify the config after calling 'begin_training' to ensure labels
|
||||
# are properly initialized
|
||||
verify_config(nlp)
|
||||
if "pretraining" in config and config["pretraining"]:
|
||||
P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
|
||||
loaded = add_tok2vec_weights(nlp, P, I)
|
||||
loaded = add_tok2vec_weights(nlp, P, V)
|
||||
if loaded and P["component"]:
|
||||
on_success(f"Loaded pretrained weights into component '{P['component']}'")
|
||||
msg.good(f"Loaded pretrained weights into component '{P['component']}'")
|
||||
nlp = before_to_disk(nlp)
|
||||
return nlp
|
||||
|
||||
|
@ -76,11 +74,12 @@ def init_vocab(
|
|||
data: Optional[Path] = None,
|
||||
lookups: Optional[Lookups] = None,
|
||||
vectors: Optional[str] = None,
|
||||
on_success: Callable[[str], None] = lambda x: None,
|
||||
silent: bool = True,
|
||||
) -> Language:
|
||||
msg = Printer(no_print=silent)
|
||||
if lookups:
|
||||
nlp.vocab.lookups = lookups
|
||||
on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
||||
msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
||||
data_path = ensure_path(data)
|
||||
if data_path is not None:
|
||||
lex_attrs = srsly.read_jsonl(data_path)
|
||||
|
@ -96,11 +95,11 @@ def init_vocab(
|
|||
else:
|
||||
oov_prob = DEFAULT_OOV_PROB
|
||||
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||
on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
||||
on_success("Created vocabulary")
|
||||
msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
||||
msg.good("Created vocabulary")
|
||||
if vectors is not None:
|
||||
load_vectors_into_model(nlp, vectors)
|
||||
on_success(f"Added vectors: {vectors}")
|
||||
msg.good(f"Added vectors: {vectors}")
|
||||
|
||||
|
||||
def load_vectors_into_model(
|
||||
|
@ -137,8 +136,8 @@ def add_tok2vec_weights(
|
|||
init_tok2vec = ensure_path(V["init_tok2vec"])
|
||||
if init_tok2vec is not None:
|
||||
if P["objective"].get("type") == "vectors" and not V["vectors"]:
|
||||
err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
|
||||
errors = [{"loc": ["initialize", "vectors"], "msg": err}]
|
||||
err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
|
||||
errors = [{"loc": ["initialize", "vocab"], "msg": err}]
|
||||
raise ConfigValidationError(config=nlp.config, errors=errors)
|
||||
if not init_tok2vec.exists():
|
||||
err = f"can't find pretrained tok2vec: {init_tok2vec}"
|
||||
|
|
|
@ -5,12 +5,13 @@ from timeit import default_timer as timer
|
|||
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
|
||||
import random
|
||||
import tqdm
|
||||
from wasabi import Printer
|
||||
|
||||
from .example import Example
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..language import Language
|
||||
from ..errors import Errors
|
||||
from ..util import resolve_dot_names, registry, logger
|
||||
from ..util import resolve_dot_names, registry
|
||||
|
||||
|
||||
def train(
|
||||
|
@ -18,8 +19,8 @@ def train(
|
|||
output_path: Optional[Path] = None,
|
||||
*,
|
||||
use_gpu: int = -1,
|
||||
logger: Callable[[Any], Any] = logger,
|
||||
) -> Optional[Path]:
|
||||
silent: bool = False,
|
||||
) -> None:
|
||||
"""Train a pipeline.
|
||||
|
||||
nlp (Language): The initialized nlp object with the full config.
|
||||
|
@ -31,7 +32,7 @@ def train(
|
|||
swapped for CLI logger.
|
||||
RETURNS (Path / None): The path to the final exported model.
|
||||
"""
|
||||
|
||||
msg = Printer(no_print=silent)
|
||||
# Create iterator, which yields out info after each optimization step.
|
||||
config = nlp.config.interpolate()
|
||||
if config["training"]["seed"] is not None:
|
||||
|
@ -62,10 +63,10 @@ def train(
|
|||
eval_frequency=T["eval_frequency"],
|
||||
exclude=frozen_components,
|
||||
)
|
||||
logger.info(f"Pipeline: {nlp.pipe_names}")
|
||||
msg.info(f"Pipeline: {nlp.pipe_names}")
|
||||
if frozen_components:
|
||||
logger.info(f"Frozen components: {frozen_components}")
|
||||
logger.info(f"Initial learn rate: {optimizer.learn_rate}")
|
||||
msg.info(f"Frozen components: {frozen_components}")
|
||||
msg.info(f"Initial learn rate: {optimizer.learn_rate}")
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
print_row, finalize_logger = train_logger(nlp)
|
||||
try:
|
||||
|
@ -89,7 +90,7 @@ def train(
|
|||
if output_path is not None:
|
||||
# We don't want to swallow the traceback if we don't have a
|
||||
# specific error.
|
||||
logger.warn(
|
||||
msg.warn(
|
||||
f"Aborting and saving the final best model. "
|
||||
f"Encountered exception: {str(e)}"
|
||||
)
|
||||
|
@ -105,7 +106,7 @@ def train(
|
|||
nlp.to_disk(final_model_path)
|
||||
else:
|
||||
nlp.to_disk(final_model_path)
|
||||
return final_model_path
|
||||
msg.good(f"Saved pipeline to output directory", final_model_path)
|
||||
|
||||
|
||||
def train_while_improving(
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional, Callable, Any, Iterable, Union, List
|
||||
from typing import Optional, Callable, Iterable, Union, List
|
||||
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
|
||||
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
|
||||
from pathlib import Path
|
||||
|
@ -8,7 +8,7 @@ import srsly
|
|||
import numpy
|
||||
import time
|
||||
import re
|
||||
from wasabi import msg
|
||||
from wasabi import Printer
|
||||
|
||||
from .example import Example
|
||||
from ..tokens import Doc
|
||||
|
@ -16,7 +16,7 @@ from ..attrs import ID
|
|||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
||||
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
|
||||
from ..util import registry, load_model_from_config, dot_to_object, logger
|
||||
from ..util import registry, load_model_from_config, dot_to_object
|
||||
|
||||
|
||||
def pretrain(
|
||||
|
@ -25,8 +25,9 @@ def pretrain(
|
|||
resume_path: Optional[Path] = None,
|
||||
epoch_resume: Optional[int] = None,
|
||||
use_gpu: int = -1,
|
||||
logger: Callable[[Any], Any] = logger,
|
||||
silent: bool = True,
|
||||
):
|
||||
msg = Printer(no_print=silent)
|
||||
if config["training"]["seed"] is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
allocator = config["training"]["gpu_allocator"]
|
||||
|
@ -42,11 +43,10 @@ def pretrain(
|
|||
optimizer = P["optimizer"]
|
||||
# Load in pretrained weights to resume from
|
||||
if resume_path is not None:
|
||||
_resume_model(model, resume_path, epoch_resume)
|
||||
_resume_model(model, resume_path, epoch_resume, silent=silent)
|
||||
else:
|
||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||
epoch_resume = 0
|
||||
|
||||
# TODO: move this to logger function?
|
||||
tracker = ProgressTracker(frequency=10000)
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||
|
@ -94,12 +94,10 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
|||
|
||||
|
||||
def _resume_model(
|
||||
model: Model,
|
||||
resume_path: Path,
|
||||
epoch_resume: int,
|
||||
logger: Callable[[Any], Any] = logger,
|
||||
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
|
||||
) -> None:
|
||||
logger.info(f"Resume training tok2vec from: {resume_path}")
|
||||
msg = Printer(no_print=silent)
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
with resume_path.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
model.get_ref("tok2vec").from_bytes(weights_data)
|
||||
|
@ -108,9 +106,9 @@ def _resume_model(
|
|||
if model_name:
|
||||
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
|
||||
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
|
||||
logger.info(f"Resuming from epoch: {epoch_resume}")
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
logger.info(f"Resuming from epoch: {epoch_resume}")
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
|
||||
|
||||
def make_update(
|
||||
|
|
Loading…
Reference in New Issue
Block a user