2020-08-14 16:00:52 +03:00
|
|
|
import logging
|
2020-10-03 15:57:46 +03:00
|
|
|
import sys
|
2023-06-26 12:41:03 +03:00
|
|
|
from pathlib import Path
|
|
|
|
from typing import Any, Dict, Optional, Union
|
|
|
|
|
|
|
|
import typer
|
|
|
|
from wasabi import msg
|
2020-01-29 19:06:46 +03:00
|
|
|
|
2020-02-27 20:42:27 +03:00
|
|
|
from .. import util
|
2023-06-26 12:41:03 +03:00
|
|
|
from ..training.initialize import init_nlp
|
|
|
|
from ..training.loop import train as train_nlp
|
|
|
|
from ._util import (
|
|
|
|
Arg,
|
|
|
|
Opt,
|
|
|
|
app,
|
Accept multiple code files in all CLI commands (#12101)
* Add support for multiple code files to all relevant commands
Prior to this, only the package command supported multiple code files.
* Update docs
* Add debug data test, plus generic fixtures
One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.
* Add debug config test and restructure
The code argument imports the provided file. If it adds item to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.
* Use a more generic, parametrized test
* Add output arg for assemble and pretrain
Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.
* Add evaluate test and some cleanup
* Mark tests as slow
* Revert argument name change
* Apply suggestions from code review
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Format API CLI docs
* isort
* Fix imports in tests
* isort
* Undo changes to package CLI help
* Fix python executable and lang code in test
* Fix executable in another test
---------
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
2023-08-01 16:24:02 +03:00
|
|
|
import_code_paths,
|
2023-06-26 12:41:03 +03:00
|
|
|
parse_config_overrides,
|
|
|
|
setup_gpu,
|
|
|
|
show_validation_error,
|
|
|
|
)
|
2020-07-10 14:31:27 +03:00
|
|
|
|
2020-06-21 14:44:00 +03:00
|
|
|
|
2020-07-10 18:57:40 +03:00
|
|
|
@app.command(
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def train_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
    convert data from other formats, use the `spacy convert` command. The
    config file includes all settings and hyperparameters used during training.
    To override settings in the config, e.g. settings that point to local
    paths or that you want to experiment with, you can override them as
    command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]". The --code argument
    lets you pass in one or more Python files (as comma-separated paths) that
    are imported before training. They can be used to register custom
    functions and architectures that can then be referenced in the config.

    DOCS: https://spacy.io/api/cli#train
    """
    # NOTE: the docstring above doubles as the CLI help text, so parameter
    # documentation lives in the Arg/Opt help strings rather than in it.

    # Emit debug-level log records only when --verbose was requested.
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Arguments the command does not declare are kept in ctx.args (enabled by
    # the context_settings above) and parsed into config overrides.
    overrides = parse_config_overrides(ctx.args)
    # Import the user-supplied code before train() runs, so that whatever it
    # registers can be referenced by the config.
    import_code_paths(code_path)
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
|
|
|
|
|
|
|
|
|
|
|
|
def train(
    config_path: Union[str, Path],
    output_path: Optional[Union[str, Path]] = None,
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
    """Load a training config, initialize the pipeline and run the training loop.

    config_path (Union[str, Path]): Location of the config file. The value "-"
        is exempt from the existence check.
    output_path (Optional[Union[str, Path]]): Directory handed to the training
        loop. It is created (including parents) if it does not exist yet. If
        omitted, an informational message is printed and no directory is made.
    use_gpu (int): Forwarded to setup_gpu, init_nlp and the training loop.
    overrides (Dict[str, Any]): Passed to util.load_config as config overrides.
    """
    config_path = util.ensure_path(config_path)
    output_path = util.ensure_path(output_path)

    # Validate the config location up front; "-" is accepted without a
    # filesystem lookup.
    config_available = bool(config_path) and (
        str(config_path) == "-" or config_path.exists()
    )
    if not config_available:
        msg.fail("Config file not found", config_path, exits=1)

    if output_path:
        if not output_path.exists():
            output_path.mkdir(parents=True)
            msg.good(f"Created output directory: {output_path}")
        msg.info(f"Saving to output directory: {output_path}")
    else:
        msg.info("No output directory provided")

    setup_gpu(use_gpu)

    # The config is loaded without interpolation (interpolate=False).
    with show_validation_error(config_path):
        config = util.load_config(
            config_path,
            overrides=overrides,
            interpolate=False,
        )

    msg.divider("Initializing pipeline")
    with show_validation_error(config_path, hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")

    msg.divider("Training pipeline")
    train_nlp(
        nlp,
        output_path,
        use_gpu=use_gpu,
        stdout=sys.stdout,
        stderr=sys.stderr,
    )
|