Add distill subcommand (#13431)

* Add distill subcommand

This subcommand distills a student model from a teacher model.
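
For example, a distillation run might look like this (the teacher name and corpus paths are illustrative; see the documentation added in this commit for the full argument list):

```bash
$ python -m spacy distill en_core_web_lg student.cfg -o ./student-output --paths.distill ./raw_corpus.spacy --paths.dev ./dev.spacy
```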

* Fixes from Sofie

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Type and doc fixes

* Wording

* distill: document missing `-o`

* Wording

* Small fix

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Daniël de Kok 2024-04-11 19:33:46 +02:00 committed by GitHub
parent 304b9331e6
commit fbc14aea45
4 changed files with 201 additions and 6 deletions


@@ -12,6 +12,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .debug_model import debug_model # noqa: F401
from .distill import distill # noqa: F401
from .download import download # noqa: F401
from .evaluate import evaluate # noqa: F401
from .find_function import find_function # noqa: F401

spacy/cli/distill.py Normal file (98 additions)

@@ -0,0 +1,98 @@
import logging
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Union
import typer
from wasabi import msg
from .. import util
from ..pipeline.trainable_pipe import TrainablePipe
from ..schemas import ConfigSchemaDistill
from ..training.initialize import init_nlp_student
from ..training.loop import distill as distill_nlp
from ._util import (
Arg,
Opt,
app,
import_code_paths,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command(
"distill",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def distill_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
teacher_model: str = Arg(..., help="Teacher model name or path"),
student_config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""
Distill a spaCy pipeline from a teacher model.
DOCS: https://spacy.io/api/cli#distill
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code_paths(code_path)
distill(
teacher_model,
student_config_path,
output_path,
use_gpu=use_gpu,
overrides=overrides,
)
def distill(
teacher_model: Union[str, Path],
student_config_path: Union[str, Path],
output_path: Optional[Union[str, Path]] = None,
*,
use_gpu: int = -1,
overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
student_config_path = util.ensure_path(student_config_path)
output_path = util.ensure_path(output_path)
# Make sure all files and paths exist if they are needed
if not student_config_path or (
str(student_config_path) != "-" and not student_config_path.exists()
):
msg.fail("Student config file not found", student_config_path, exits=1)
if not output_path:
msg.info("No output directory provided")
else:
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}")
msg.info(f"Saving to output directory: {output_path}")
setup_gpu(use_gpu)
teacher = util.load_model(teacher_model)
with show_validation_error(student_config_path):
config = util.load_config(
student_config_path, overrides=overrides, interpolate=False
)
msg.divider("Initializing student pipeline")
with show_validation_error(student_config_path, hint_fill=False):
student = init_nlp_student(config, teacher, use_gpu=use_gpu)
msg.good("Initialized student pipeline")
msg.divider("Distilling student pipeline from teacher")
distill_nlp(
teacher,
student,
output_path,
use_gpu=use_gpu,
stdout=sys.stdout,
stderr=sys.stderr,
)
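
Besides the CLI entry point, the new `distill` function can also be called from Python. A minimal sketch, with an illustrative teacher name, config path and output directory (the config overrides mirror the CLI's `--paths.*` options):

```python
from spacy.cli.distill import distill

# Teacher pipeline, student config and output directory are illustrative.
distill(
    "en_core_web_lg",    # teacher pipeline name or path
    "student.cfg",       # student config
    "student-output",    # directory for the distilled pipeline
    use_gpu=-1,          # -1 = CPU, otherwise a GPU ID
    overrides={"paths.distill": "raw_corpus.spacy", "paths.dev": "dev.spacy"},
)
```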


@@ -52,6 +52,7 @@ def test_convert_auto_conflict():
NOOP_CONFIG = """
[paths]
train = null
distill = null
dev = null
vectors = null
init_tok2vec = null
@@ -96,6 +97,14 @@ max_length = 0
limit = 0
augmenter = null
[corpora.distill]
@readers = "spacy.Corpus.v1"
path = ${paths.distill}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
@@ -143,6 +152,37 @@ learn_rate = 0.001
[training.score_weights]
[distillation]
student_to_teacher = []
dropout = 0.1
max_epochs = 0
max_steps = 100
corpus = "corpora.distill"
[distillation.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[distillation.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[distillation.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[pretraining]
[initialize]
@@ -172,6 +212,8 @@ def data_paths():
db.to_disk(fpath)
args = [
"--paths.distill",
str(fpath),
"--paths.train",
str(fpath),
"--paths.dev",
@@ -211,21 +253,33 @@ def noop_config():
yield cfg
@pytest.fixture
def noop_model():
with make_tempdir() as temp_d:
nlp = spacy.blank("en")
path = temp_d / "noop-model"
nlp.to_disk(path)
yield path
@pytest.mark.slow
@pytest.mark.parametrize(
"cmd",
["debug config", "debug data", "train", "assemble"],
["debug config", "debug data", "train", "assemble", "distill"],
)
def test_multi_code(cmd, code_paths, data_paths, noop_config):
def test_multi_code(cmd, code_paths, data_paths, noop_config, noop_model):
# check that it fails without the code arg
cmd = cmd.split()
output = ["."] if cmd[0] == "assemble" else []
model = [str(noop_model)] if cmd[0] == "distill" else []
cmd = [sys.executable, "-m", "spacy"] + cmd
result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
result = subprocess.run([*cmd, *model, str(noop_config), *output, *data_paths])
assert result.returncode == 1
# check that it succeeds with the code arg
result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
result = subprocess.run(
[*cmd, *model, str(noop_config), *output, *data_paths, *code_paths]
)
assert result.returncode == 0


@@ -19,6 +19,7 @@ menu:
- ['assemble', 'assemble']
- ['package', 'package']
- ['project', 'project']
- ['distill', 'distill']
- ['huggingface-hub', 'huggingface-hub']
---
@@ -1296,8 +1297,10 @@ input formats are:
When a directory is provided it is traversed recursively to collect all files.
When loading a .spacy file, any potential annotations stored on the `Doc` that
are not overwritten by the pipeline will be preserved. If you want to evaluate
the pipeline on raw text only, make sure that the .spacy file does not contain
any annotations.
```bash
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
@@ -1699,6 +1702,45 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
## distill {id="distill", tag="experimental", version="4"}
Distill a _student_ pipeline from a _teacher_ pipeline. Distillation trains the
models in the student pipeline on the activations of the teacher's models. A
typical use case for distillation is to extract a smaller, more performant model
from a larger high-accuracy model. Since distillation uses the activations of
the teacher, it can be performed on a corpus of raw text without (gold-standard)
annotations. A development set of gold annotations _is_ needed to evaluate the
student pipeline during distillation.
`distill` will save out the best-performing pipeline across all epochs, as well
as the final pipeline. The `--code` argument can be used to provide a Python
file that's imported before the distillation process starts. This lets you
register [custom functions](/usage/training#custom-functions) and architectures
and refer to them in your config, all while still using spaCy's built-in
`distill` workflow. If you need to manage complex multi-step training workflows,
check out [Weasel](https://github.com/explosion/weasel).
> #### Example
>
> ```bash
> $ python -m spacy distill teacher-pipeline student.cfg --output ./output --paths.distill ./distill --paths.dev ./dev
> ```
```bash
$ python -m spacy distill [teacher_model] [student_config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
```
| Name | Description |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `teacher_model` | The teacher pipeline (name or path) to distill the student from. ~~Union[str, Path] (positional)~~ |
| `student_config_path` | The configuration of the student pipeline. ~~Path (positional)~~ |
| `--output`, `-o` | Directory to store the distilled pipeline in. Will be created if it doesn't exist. No pipeline will be saved when this option is absent. ~~Optional[Path] \(option)~~ |
| `--code`, `-c` | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during distillation. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | The final distilled pipeline and the best distilled pipeline. |
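
The student config uses the regular training config format, extended with
distillation-specific sections such as `[corpora.distill]` and `[distillation]`.
A minimal sketch, loosely based on the test config added in this commit (values
are illustrative, not recommended settings):

```ini
[paths]
distill = null
dev = null

[corpora.distill]
@readers = "spacy.Corpus.v1"
path = ${paths.distill}

[distillation]
student_to_teacher = []
dropout = 0.1
max_steps = 100
corpus = "corpora.distill"

[distillation.batcher]
@batchers = "spacy.batch_by_words.v1"

[distillation.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
```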
## huggingface-hub {id="huggingface-hub",version="3.1"}
The `spacy huggingface-cli` CLI includes commands for uploading your trained