Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Ines Montani 2020-10-03 16:08:27 +02:00
commit 3b8f352eda
6 changed files with 119 additions and 86 deletions

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a29" __version__ = "3.0.0a30"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -3,6 +3,7 @@ from pathlib import Path
from wasabi import msg from wasabi import msg
import typer import typer
import logging import logging
import sys
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu from ._util import import_code, setup_gpu
@ -39,7 +40,12 @@ def train_cli(
DOCS: https://nightly.spacy.io/api/cli#train DOCS: https://nightly.spacy.io/api/cli#train
""" """
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
verify_cli_args(config_path, output_path) # Make sure all files and paths exist if they are needed
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_path is not None and not output_path.exists():
output_path.mkdir()
msg.good(f"Created output directory: {output_path}")
overrides = parse_config_overrides(ctx.args) overrides = parse_config_overrides(ctx.args)
import_code(code_path) import_code(code_path)
setup_gpu(use_gpu) setup_gpu(use_gpu)
@ -50,14 +56,4 @@ def train_cli(
nlp = init_nlp(config, use_gpu=use_gpu) nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline") msg.good("Initialized pipeline")
msg.divider("Training pipeline") msg.divider("Training pipeline")
train(nlp, output_path, use_gpu=use_gpu, silent=False) train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
# Make sure all files and paths exist if they are needed
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_path is not None:
if not output_path.exists():
output_path.mkdir()
msg.good(f"Created output directory: {output_path}")

View File

@ -102,7 +102,7 @@ def load_vectors_into_model(
"with the packaged vectors. Make sure that the vectors package you're " "with the packaged vectors. Make sure that the vectors package you're "
"loading is compatible with the current version of spaCy." "loading is compatible with the current version of spaCy."
) )
err = ConfigValidationError.from_error(config=None, title=title, desc=desc) err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc)
raise err from None raise err from None
nlp.vocab.vectors = vectors_nlp.vocab.vectors nlp.vocab.vectors = vectors_nlp.vocab.vectors
if add_strings: if add_strings:

View File

@ -1,18 +1,24 @@
from typing import Dict, Any, Tuple, Callable, List from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
import wasabi
import tqdm
import sys
from ..util import registry from ..util import registry
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors
from wasabi import msg
@registry.loggers("spacy.ConsoleLogger.v1") @registry.loggers("spacy.ConsoleLogger.v1")
def console_logger(): def console_logger(progress_bar: bool=False):
def setup_printer( def setup_printer(
nlp: "Language", nlp: "Language",
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: stdout: IO=sys.stdout,
stderr: IO=sys.stderr
) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]:
msg = wasabi.Printer(no_print=True)
# we assume here that only components are enabled that should be trained & logged # we assume here that only components are enabled that should be trained & logged
logged_pipes = nlp.pipe_names logged_pipes = nlp.pipe_names
eval_frequency = nlp.config["training"]["eval_frequency"]
score_weights = nlp.config["training"]["score_weights"] score_weights = nlp.config["training"]["score_weights"]
score_cols = [col for col, value in score_weights.items() if value is not None] score_cols = [col for col, value in score_weights.items() if value is not None]
score_widths = [max(len(col), 6) for col in score_cols] score_widths = [max(len(col), 6) for col in score_cols]
@ -22,10 +28,18 @@ def console_logger():
table_header = [col.upper() for col in table_header] table_header = [col.upper() for col in table_header]
table_widths = [3, 6] + loss_widths + score_widths + [6] table_widths = [3, 6] + loss_widths + score_widths + [6]
table_aligns = ["r" for _ in table_widths] table_aligns = ["r" for _ in table_widths]
msg.row(table_header, widths=table_widths) stdout.write(msg.row(table_header, widths=table_widths))
msg.row(["-" * width for width in table_widths]) stdout.write(msg.row(["-" * width for width in table_widths]))
progress = None
def log_step(info: Dict[str, Any]): def log_step(info: Optional[Dict[str, Any]]):
nonlocal progress
if info is None:
# If we don't have a new checkpoint, just return.
if progress is not None:
progress.update(1)
return
try: try:
losses = [ losses = [
"{0:.2f}".format(float(info["losses"][pipe_name])) "{0:.2f}".format(float(info["losses"][pipe_name]))
@ -39,24 +53,37 @@ def console_logger():
keys=list(info["losses"].keys()), keys=list(info["losses"].keys()),
) )
) from None ) from None
scores = [] scores = []
for col in score_cols: for col in score_cols:
score = info["other_scores"].get(col, 0.0) score = info["other_scores"].get(col, 0.0)
try: try:
score = float(score) score = float(score)
if col != "speed":
score *= 100
scores.append("{0:.2f}".format(score))
except TypeError: except TypeError:
err = Errors.E916.format(name=col, score_type=type(score)) err = Errors.E916.format(name=col, score_type=type(score))
raise ValueError(err) from None raise ValueError(err) from None
if col != "speed":
score *= 100
scores.append("{0:.2f}".format(score))
data = ( data = (
[info["epoch"], info["step"]] [info["epoch"], info["step"]]
+ losses + losses
+ scores + scores
+ ["{0:.2f}".format(float(info["score"]))] + ["{0:.2f}".format(float(info["score"]))]
) )
msg.row(data, widths=table_widths, aligns=table_aligns) if progress is not None:
progress.close()
stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns))
if progress_bar:
# Set disable=None, so that it disables on non-TTY
progress = tqdm.tqdm(
total=eval_frequency,
disable=None,
leave=False,
file=stderr
)
progress.set_description(f"Epoch {info['epoch']+1}")
def finalize(): def finalize():
pass pass
@ -70,10 +97,12 @@ def console_logger():
def wandb_logger(project_name: str, remove_config_values: List[str] = []): def wandb_logger(project_name: str, remove_config_values: List[str] = []):
import wandb import wandb
console = console_logger() console = console_logger(progress_bar=False)
def setup_logger( def setup_logger(
nlp: "Language", nlp: "Language",
stdout: IO=sys.stdout,
stderr: IO=sys.stderr
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
config = nlp.config.interpolate() config = nlp.config.interpolate()
config_dot = util.dict_to_dot(config) config_dot = util.dict_to_dot(config)
@ -81,18 +110,19 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
del config_dot[field] del config_dot[field]
config = util.dot_to_dict(config_dot) config = util.dot_to_dict(config_dot)
wandb.init(project=project_name, config=config, reinit=True) wandb.init(project=project_name, config=config, reinit=True)
console_log_step, console_finalize = console(nlp) console_log_step, console_finalize = console(nlp, stdout, stderr)
def log_step(info: Dict[str, Any]): def log_step(info: Optional[Dict[str, Any]]):
console_log_step(info) console_log_step(info)
score = info["score"] if info is not None:
other_scores = info["other_scores"] score = info["score"]
losses = info["losses"] other_scores = info["other_scores"]
wandb.log({"score": score}) losses = info["losses"]
if losses: wandb.log({"score": score})
wandb.log({f"loss_{k}": v for k, v in losses.items()}) if losses:
if isinstance(other_scores, dict): wandb.log({f"loss_{k}": v for k, v in losses.items()})
wandb.log(other_scores) if isinstance(other_scores, dict):
wandb.log(other_scores)
def finalize(): def finalize():
console_finalize() console_finalize()

View File

@ -1,11 +1,11 @@
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
from typing import Optional, TYPE_CHECKING from typing import Optional, TYPE_CHECKING
from pathlib import Path from pathlib import Path
from timeit import default_timer as timer from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
import random import random
import tqdm import wasabi
from wasabi import Printer import sys
from .example import Example from .example import Example
from ..schemas import ConfigSchemaTraining from ..schemas import ConfigSchemaTraining
@ -21,7 +21,8 @@ def train(
output_path: Optional[Path] = None, output_path: Optional[Path] = None,
*, *,
use_gpu: int = -1, use_gpu: int = -1,
silent: bool = False, stdout: IO=sys.stdout,
stderr: IO=sys.stderr
) -> None: ) -> None:
"""Train a pipeline. """Train a pipeline.
@ -29,10 +30,15 @@ def train(
output_path (Path): Optional output path to save trained model to. output_path (Path): Optional output path to save trained model to.
use_gpu (int): Whether to train on GPU. Make sure to call require_gpu use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
before calling this function. before calling this function.
silent (bool): Whether to pretty-print outputs. stdout (file): A file-like object to write output messages. To disable
printing, pass an io.StringIO() instance.
stderr (file): A second file-like object to write output messages. To disable
printing, pass an io.StringIO() instance.
RETURNS (Path / None): The path to the final exported model. RETURNS (Path / None): The path to the final exported model.
""" """
msg = Printer(no_print=silent) # We use no_print here so we can respect the stdout/stderr options.
msg = wasabi.Printer(no_print=True)
# Create iterator, which yields out info after each optimization step. # Create iterator, which yields out info after each optimization step.
config = nlp.config.interpolate() config = nlp.config.interpolate()
if config["training"]["seed"] is not None: if config["training"]["seed"] is not None:
@ -63,50 +69,44 @@ def train(
eval_frequency=T["eval_frequency"], eval_frequency=T["eval_frequency"],
exclude=frozen_components, exclude=frozen_components,
) )
msg.info(f"Pipeline: {nlp.pipe_names}") stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}"))
if frozen_components: if frozen_components:
msg.info(f"Frozen components: {frozen_components}") stdout.write(msg.info(f"Frozen components: {frozen_components}"))
msg.info(f"Initial learn rate: {optimizer.learn_rate}") stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}"))
with nlp.select_pipes(disable=frozen_components): with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp) log_step, finalize_logger = train_logger(nlp, stdout, stderr)
try: try:
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
progress.set_description(f"Epoch 1")
for batch, info, is_best_checkpoint in training_step_iterator: for batch, info, is_best_checkpoint in training_step_iterator:
progress.update(1) log_step(info if is_best_checkpoint else None)
if is_best_checkpoint is not None: if is_best_checkpoint is not None and output_path is not None:
progress.close() with nlp.select_pipes(disable=frozen_components):
print_row(info) update_meta(T, nlp, info)
if is_best_checkpoint and output_path is not None: with nlp.use_params(optimizer.averages):
with nlp.select_pipes(disable=frozen_components): nlp = before_to_disk(nlp)
update_meta(T, nlp, info) nlp.to_disk(output_path / "model-best")
with nlp.use_params(optimizer.averages):
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
progress.set_description(f"Epoch {info['epoch']}")
except Exception as e: except Exception as e:
finalize_logger()
if output_path is not None: if output_path is not None:
# We don't want to swallow the traceback if we don't have a # We don't want to swallow the traceback if we don't have a
# specific error. # specific error, but we do want to warn that we're trying
msg.warn( # to do something here.
f"Aborting and saving the final best model. " stdout.write(
f"Encountered exception: {str(e)}" msg.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}"
)
) )
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-final")
raise e raise e
finally: finally:
finalize_logger() finalize_logger()
if output_path is not None: if output_path is not None:
final_model_path = output_path / "model-final" final_model_path = output_path / "model-last"
if optimizer.averages: if optimizer.averages:
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path) nlp.to_disk(final_model_path)
else: else:
nlp.to_disk(final_model_path) nlp.to_disk(final_model_path)
msg.good(f"Saved pipeline to output directory", final_model_path) # This will only run if we don't hit an error
stdout.write(msg.good("Saved pipeline to output directory", final_model_path))
def train_while_improving( def train_while_improving(

View File

@ -689,8 +689,8 @@ During training, the results of each step are passed to a logger function. By
default, these results are written to the console with the default, these results are written to the console with the
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a [`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
**dictionary** with the following keys: receives a **dictionary** with the following keys:
| Key | Value | | Key | Value |
| -------------- | ----------------------------------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------------- |
@ -715,30 +715,37 @@ tabular results to a file:
```python ```python
### functions.py ### functions.py
from typing import Tuple, Callable, Dict, Any import sys
from typing import IO, Tuple, Callable, Dict, Any
import spacy import spacy
from spacy import Language
from pathlib import Path from pathlib import Path
@spacy.registry.loggers("my_custom_logger.v1") @spacy.registry.loggers("my_custom_logger.v1")
def custom_logger(log_path): def custom_logger(log_path):
def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]: def setup_logger(
with Path(log_path).open("w", encoding="utf8") as file_: nlp: Language,
file_.write("step\\t") stdout: IO=sys.stdout,
file_.write("score\\t") stderr: IO=sys.stderr
for pipe in nlp.pipe_names: ) -> Tuple[Callable, Callable]:
file_.write(f"loss_{pipe}\\t") stdout.write(f"Logging to {log_path}\n")
file_.write("\\n") log_file = Path(log_path).open("w", encoding="utf8")
log_file.write("step\\t")
log_file.write("score\\t")
for pipe in nlp.pipe_names:
log_file.write(f"loss_{pipe}\\t")
log_file.write("\\n")
def log_step(info: Dict[str, Any]): def log_step(info: Optional[Dict[str, Any]]):
with Path(log_path).open("a") as file_: if info:
file_.write(f"{info['step']}\\t") log_file.write(f"{info['step']}\\t")
file_.write(f"{info['score']}\\t") log_file.write(f"{info['score']}\\t")
for pipe in nlp.pipe_names: for pipe in nlp.pipe_names:
file_.write(f"{info['losses'][pipe]}\\t") log_file.write(f"{info['losses'][pipe]}\\t")
file_.write("\\n") log_file.write("\\n")
def finalize(): def finalize():
pass log_file.close()
return log_step, finalize return log_step, finalize