Add distill subcommand (#13431)

* Add distill subcommand This subcommand distills a student model from a teacher model. * Fixes from Sofie Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Type and doc fixes * Wording * distill: document missing `-o` * Wording * Small fix --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2025-11-06 19:07:30 +03:00 · 2024-04-11 19:33:46 +02:00 · 2024-04-11 19:33:46 +02:00 · fbc14aea45
commit fbc14aea45
parent 304b9331e6
4 changed files with 201 additions and 6 deletions
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -12,6 +12,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .distill import distill  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .find_function import find_function  # noqa: F401
--- a/spacy/cli/distill.py
+++ b/spacy/cli/distill.py
@ -0,0 +1,98 @@
 import logging
 import sys
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
 import typer
 from wasabi import msg
 from .. import util
 from ..pipeline.trainable_pipe import TrainablePipe
 from ..schemas import ConfigSchemaDistill
 from ..training.initialize import init_nlp_student
 from ..training.loop import distill as distill_nlp
 from ._util import (
    Arg,
    Opt,
    app,
    import_code_paths,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
 )
@app.command(
    "distill",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def distill_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    teacher_model: str = Arg(..., help="Teacher model name or path"),
    student_config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """
    Distill a spaCy pipeline from a teacher model.
    DOCS: https://spacy.io/api/cli#distill
    """
    # The verbose flag switches the shared spaCy logger to debug output;
    # otherwise it is set to info level.
    log_level = logging.DEBUG if verbose else logging.INFO
    util.logger.setLevel(log_level)
    # The command accepts extra/unknown arguments (see context_settings
    # above); whatever ended up on the context is parsed as config overrides.
    config_overrides = parse_config_overrides(ctx.args)
    # Import the user-supplied code files before the actual distillation
    # is started.
    import_code_paths(code_path)
    distill(
        teacher_model,
        student_config_path,
        output_path,
        use_gpu=use_gpu,
        overrides=config_overrides,
    )
 def distill(
     teacher_model: Union[str, Path],
     student_config_path: Union[str, Path],
     output_path: Optional[Union[str, Path]] = None,
     *,
     use_gpu: int = -1,
     overrides: Dict[str, Any] = util.SimpleFrozenDict(),
 ):
     """Distill a student pipeline from a teacher pipeline.

     teacher_model (Union[str, Path]): Name or path of the teacher pipeline,
         handed to util.load_model.
     student_config_path (Union[str, Path]): Path of the student config. The
         value "-" is accepted without an existence check (presumably it
         stands for standard input; confirm against util.load_config).
     output_path (Optional[Union[str, Path]]): Output directory. It is
         created, including parents, when it does not exist yet. A falsy
         value is reported and passed on to the distillation loop as-is.
     use_gpu (int): Passed to setup_gpu, init_nlp_student and the
         distillation loop.
     overrides (Dict[str, Any]): Config overrides applied when the student
         config is loaded.
     """
     student_config_path = util.ensure_path(student_config_path)
     output_path = util.ensure_path(output_path)

     # Make sure all files and paths exist if they are needed
     if not student_config_path:
         config_available = False
     else:
         config_available = (
             str(student_config_path) == "-" or student_config_path.exists()
         )
     if not config_available:
         msg.fail("Student config file not found", student_config_path, exits=1)

     if output_path:
         if not output_path.exists():
             output_path.mkdir(parents=True)
             msg.good(f"Created output directory: {output_path}")
         msg.info(f"Saving to output directory: {output_path}")
     else:
         msg.info("No output directory provided")

     # The device is configured before the teacher pipeline is loaded.
     setup_gpu(use_gpu)
     teacher = util.load_model(teacher_model)

     # The config is loaded without interpolation; validation problems are
     # reported through show_validation_error.
     with show_validation_error(student_config_path):
         student_config = util.load_config(
             student_config_path, overrides=overrides, interpolate=False
         )

     msg.divider("Initializing student pipeline")
     with show_validation_error(student_config_path, hint_fill=False):
         student = init_nlp_student(student_config, teacher, use_gpu=use_gpu)
     msg.good("Initialized student pipeline")

     msg.divider("Distilling student pipeline from teacher")
     distill_nlp(
         teacher,
         student,
         output_path,
         use_gpu=use_gpu,
         stdout=sys.stdout,
         stderr=sys.stderr,
     )
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@ -52,6 +52,7 @@ def test_convert_auto_conflict():
 NOOP_CONFIG = """
 [paths]
 train = null
 distill = null
 dev = null
 vectors = null
 init_tok2vec = null
@ -96,6 +97,14 @@ max_length = 0
 limit = 0
 augmenter = null
 [corpora.distill]
@readers = "spacy.Corpus.v1"
 path = ${paths.distill}
 gold_preproc = false
 max_length = 0
 limit = 0
 augmenter = null
 [training]
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
@ -143,6 +152,37 @@ learn_rate = 0.001
 [training.score_weights]
 [distillation]
 student_to_teacher = []
 dropout = 0.1
 max_epochs = 0
 max_steps = 100
 corpus = "corpora.distill"
 [distillation.batcher]
@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2
 get_length = null
 [distillation.batcher.size]
@schedules = "compounding.v1"
 start = 100
 stop = 1000
 compound = 1.001
 t = 0.0
 [distillation.optimizer]
@optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 0.00000001
 learn_rate = 0.001
 [pretraining]
 [initialize]
@ -172,6 +212,8 @@ def data_paths():
        db.to_disk(fpath)
        args = [
            "--paths.distill",
            str(fpath),
            "--paths.train",
            str(fpath),
            "--paths.dev",
@ -211,21 +253,33 @@ def noop_config():
        yield cfg
@pytest.fixture
def noop_model():
    """Yield the path of a blank English pipeline saved to a temp directory."""
    with make_tempdir() as tmp_dir:
        model_dir = tmp_dir / "noop-model"
        # A blank "en" pipeline is written to disk and its path is yielded
        # while the temporary directory context is still open.
        spacy.blank("en").to_disk(model_dir)
        yield model_dir
@pytest.mark.slow
@pytest.mark.parametrize(
    "cmd",
-    ["debug config", "debug data", "train", "assemble"],
+    ["debug config", "debug data", "train", "assemble", "distill"],
 )
-def test_multi_code(cmd, code_paths, data_paths, noop_config):
+def test_multi_code(cmd, code_paths, data_paths, noop_config, noop_model):
    # check that it fails without the code arg
    cmd = cmd.split()
    output = ["."] if cmd[0] == "assemble" else []
    model = [str(noop_model)] if cmd[0] == "distill" else []
    cmd = [sys.executable, "-m", "spacy"] + cmd
-    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
+    result = subprocess.run([*cmd, *model, str(noop_config), *output, *data_paths])
    assert result.returncode == 1
    # check that it succeeds with the code arg
-    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
+    result = subprocess.run(
        [*cmd, *model, str(noop_config), *output, *data_paths, *code_paths]
    )
    assert result.returncode == 0
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -19,6 +19,7 @@ menu:
  - ['assemble', 'assemble']
  - ['package', 'package']
  - ['project', 'project']
  - ['distill', 'distill']
  - ['huggingface-hub', 'huggingface-hub']
 ---
@ -1296,8 +1297,10 @@ input formats are:
 When a directory is provided it is traversed recursively to collect all files.
-When loading a .spacy file, any potential annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved.
+When loading a .spacy file, any potential annotations stored on the `Doc` that
-If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations.
+are not overwritten by the pipeline will be preserved. If you want to evaluate
 the pipeline on raw text only, make sure that the .spacy file does not contain
 any annotations.
 ```bash
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
@ -1699,6 +1702,45 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                    |
 | **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow.                 |
 ## distill {id="distill", tag="experimental", version="4"}
 Distill a _student_ pipeline from a _teacher_ pipeline. Distillation trains the
 models in the student pipeline on the activations of the teacher's models. A
 typical use case for distillation is to extract a smaller, more performant model
 from a larger high-accuracy model. Since distillation uses the activations of
 the teacher, distillation can be performed on a corpus of raw text without (gold
 standard) annotations. A development set of gold annotations _is_ needed to
 evaluate the student pipeline on during distillation.
 `distill` will save out the best performing pipeline across all epochs, as well
 as the final pipeline. The `--code` argument can be used to provide a Python
 file that's imported before the training process starts. This lets you register
 [custom functions](/usage/training#custom-functions) and architectures and refer
 to them in your config, all while still using spaCy's built-in `distill` workflow.
 If you need to manage complex multi-step training workflows, check out 
 [Weasel](https://github.com/explosion/weasel).
 > #### Example
 >
 > ```bash
 > $ python -m spacy distill teacher-pipeline student.cfg --output ./output --paths.distill ./distill --paths.dev ./dev
 > ```
 ```bash
 $ python -m spacy distill [teacher_model] [student_config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
 ```
 | Name                  | Description                                                                                                                                                                                            |
 | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `teacher_model`       | The teacher pipeline (name or path) to distill the student from. ~~Union[str, Path] (positional)~~                                                                                                     |
 | `student_config_path` | The configuration of the student pipeline. ~~Path (positional)~~                                                                                                                                       |
 | `--output`, `-o`      | Directory to store the distilled pipeline in. Will be created if it doesn't exist. No pipeline will be saved when this option is absent. ~~Optional[Path] \(option)~~                                  |
 | `--code`, `-c`        | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--verbose`, `-V`     | Show more detailed messages during distillation. ~~bool (flag)~~                                                                                                                                       |
 | `--gpu-id`, `-g`      | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                             |
 | `--help`, `-h`        | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
 | **CREATES**           | The final distilled pipeline and the best distilled pipeline.                                                                                                                                          |
 ## huggingface-hub {id="huggingface-hub",version="3.1"}
 The `spacy huggingface-cli` CLI includes commands for uploading your trained