Add apply CLI (#11376)

* annotate cli first try * add batch-size and n_process * rename to apply * typing fix * handle file suffixes * walk directories * support jsonl * typing fix * remove debug * make suffix optional for walk * revert unrelated * don't warn but raise * better error message * minor touch up * Update spacy/tests/test_cli.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * update tests and bugfix * add force_overwrite * typo * fix adding .spacy suffix * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * store user data and rename cmd arg * include test for user attr * rename cmd arg * better help message * documentation * prettier * black * link fix * Update spacy/cli/apply.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * addressing reviews * dont quit but warn * prettier Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2025-07-15 02:32:37 +03:00 · 2022-12-20 17:11:33 +01:00 · 2022-12-20 17:11:33 +01:00 · c223cd7a86
commit c223cd7a86
parent 18ffe5bbd6
6 changed files with 280 additions and 31 deletions
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -16,6 +16,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
            local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
 def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
    """Collect the files found at or below `path`.

    A `path` that is not a directory is returned unchanged as the only
    element of the list, without any filtering. Otherwise the directory is
    traversed recursively. Entries whose name starts with "." are skipped
    (for directories this skips everything below them). If `suffix` is
    given, only files whose name ends with it are kept. The result is sorted.
    """
    if not path.is_dir():
        return [path]
    # `queue` is extended while it is being iterated, so directories found
    # along the way are visited later in the same loop.
    queue = [path]
    locs = []
    seen = set()
    for current in queue:
        key = str(current)
        if key in seen:
            continue
        seen.add(key)
        # Use `.name` rather than `.parts[-1]`: Path(".") has no parts at all,
        # so indexing them raised an IndexError for the current directory.
        name = current.name
        if name.startswith("."):
            continue
        if current.is_dir():
            queue.extend(current.iterdir())
        elif suffix is None or name.endswith(suffix):
            locs.append(current)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs
 def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
    """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
    as happens with `round(number, ndigits)`"""
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@ -0,0 +1,143 @@
 import tqdm
 import srsly
 from itertools import chain
 from pathlib import Path
 from typing import Optional, List, Iterable, cast, Union
 from wasabi import msg
 from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
 from ..tokens import Doc, DocBin
 from ..vocab import Vocab
 from ..util import ensure_path, load_model
 # Help text for the positional `data_path` argument of the `apply` command.
 path_help = """Location of the documents to predict on.
 Can be a single file in .spacy format or a .jsonl file.
 Files with other extensions are treated as single plain text documents.
 If a directory is provided it is traversed recursively to grab
 all files to be processed.
 The files can be a mixture of .spacy, .jsonl and text files.
 If .jsonl is provided the specified field is going
 to be grabbed ("text" by default)."""
 # Help text for the positional `output_file` argument.
 out_help = "Path to save the resulting .spacy file"
 # Help text for the `--code` option.
 code_help = (
    "Path to Python file with additional " "code (registered functions) to be imported"
 )
 # NOTE(review): not referenced by any code visible in this diff - confirm
 # whether this constant is still needed.
 gold_help = "Use gold preprocessing provided in the .spacy files"
 # Error shown when the output file already exists and --force was not given.
 force_msg = (
    "The provided output file already exists. "
    "To force overwriting the output file, set the --force or -F flag."
 )
 # Type of a stream handed to `nlp.pipe`: either texts or Doc objects.
 DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
 def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
    """
    Load the DocBin stored at `path` and yield its Doc objects one by one,
    reconstructed with the given vocab.
    """
    yield from DocBin().from_disk(path).get_docs(vocab)
 def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
    """
    Yield the value stored under `field` for every entry of the JSONL
    file at `path`. An entry without that key is reported through
    `msg.fail` with `exits=1`.
    """
    for record in srsly.read_jsonl(path):
        if field in record:
            yield record[field]
        else:
            msg.fail(
                f"{path} does not contain the required '{field}' field.", exits=1
            )
 def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
    """
    Yields strings from text files in paths.
    """
    for path in paths:
        with open(path, "r") as fin:
            text = fin.read()
            yield text
@app.command("apply")
def apply_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output_file: Path = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
    # fmt: on
):
    """
    Apply a trained pipeline to documents to get predictions.
    Expects a loadable spaCy pipeline and path to the data, which
    can be a directory or a file.
    The data files can be provided in multiple formats:
        1. .spacy files
        2. .jsonl files with a specified "field" to read the text from.
        3. Files with any other extension are assumed to be containing
           a single document.
    DOCS: https://spacy.io/api/cli#apply
    """
    data_path = ensure_path(data_path)
    output_file = ensure_path(output_file)
    code_path = ensure_path(code_path)
    # `apply` appends ".spacy" to an output path without a suffix, so resolve
    # the final location before checking for an existing file. Otherwise an
    # existing "<name>.spacy" is overwritten without --force when only
    # "<name>" is passed.
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    if output_file.exists() and not force_overwrite:
        msg.fail(force_msg, exits=1)
    if not data_path.exists():
        msg.fail(f"Couldn't find data path: {data_path}", exits=1)
    import_code(code_path)
    setup_gpu(use_gpu)
    apply(data_path, output_file, model, text_key, batch_size, n_process)
 def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
 ):
    """
    Run the pipeline `model` over all files found at `data_path` and store
    the resulting docs, including user data, in a DocBin at `output_file`.

    Files ending in ".spacy" are read as DocBin, files ending in ".jsonl"
    contribute the value of `json_field` of each entry, and every other file
    is read as one plain text document. Plain text files are processed after
    all ".spacy" and ".jsonl" files. If `output_file` has no suffix, ".spacy"
    is appended. If no input is found, an empty DocBin is written and a
    warning is shown.
    """
    # Resolve the final output location once, so the early-return branch for
    # empty input writes to the same file as a regular run.
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if not paths:
        docbin.to_disk(output_file)
        msg.warn("Did not find data to process,"
                 f" {data_path} seems to be an empty directory.")
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            text_files.append(path)
    if text_files:
        streams.append(_stream_texts(text_files))
    # All streams are lazy generators; chaining them keeps the input lazy
    # until `nlp.pipe` consumes it.
    datagen = cast(DocOrStrStream, chain(*streams))
    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
        docbin.add(doc)
    docbin.to_disk(output_file)
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer
@ -7,7 +7,7 @@ import re
 import sys
 import itertools
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
    return None
 def walk_directory(path: Path, converter: str) -> List[Path]:
    """Collect the files at or below `path` that fit the given converter.

    A `path` that is not a directory is returned as the only element of the
    list. Entries whose name starts with "." are skipped. For the "json",
    "conll" and "iob" converters only files whose name ends with the
    converter name are kept; any other converter keeps every file.
    The result is sorted.
    """
    if not path.is_dir():
        return [path]
    # Converters that only accept files ending in their own name.
    restricted = ("json", "conll", "iob")
    pending = [path]
    found = []
    visited = set()
    # `pending` grows while it is iterated, which drives the traversal.
    for entry in pending:
        key = str(entry)
        if key in visited:
            continue
        visited.add(key)
        name = entry.parts[-1]
        if name.startswith("."):
            continue
        if entry.is_dir():
            pending.extend(entry.iterdir())
        elif converter in restricted and not name.endswith(converter):
            continue
        else:
            found.append(entry)
    # It's good to sort these, in case the ordering messes up cache.
    found.sort()
    return found
 def verify_cli_args(
    msg: Printer,
    input_path: Path,
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
 import pkg_resources
 import time
 import spacy
 import numpy
 import pytest
 import srsly
@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.project.remote_storage import RemoteStorage
 from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
 from spacy.cli.apply import apply
 from spacy.cli.find_threshold import find_threshold
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
@ -885,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
    assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
 def test_applycli_empty_dir():
    """Running `apply` on an empty directory should finish without raising."""
    with make_tempdir() as tmp_dir:
        apply(tmp_dir, tmp_dir / "test.spacy", "blank:en", "text", 1, 1)
 def test_applycli_docbin():
    """Smoke test for `.spacy` input: first an empty DocBin, then one Doc."""
    with make_tempdir() as tmp_dir:
        in_file = tmp_dir / "testin.spacy"
        out_file = tmp_dir / "testout.spacy"
        doc = spacy.blank("en")("testing apply cli.")
        # test empty DocBin case
        docbin = DocBin()
        docbin.to_disk(in_file)
        apply(tmp_dir, out_file, "blank:en", "text", 1, 1)
        # Same input file again, now holding a single Doc.
        docbin.add(doc)
        docbin.to_disk(in_file)
        apply(tmp_dir, out_file, "blank:en", "text", 1, 1)
 def test_applycli_jsonl():
    """Smoke test for `.jsonl` input read through a custom text key."""
    first_records = [{"field": "Testing apply cli.", "key": 234}]
    second_records = [{"field": "234"}]
    with make_tempdir() as tmp_dir:
        out_file = tmp_dir / "testout.spacy"
        # Run once per added file: first with one JSONL file, then with two.
        for name, records in (
            ("test.jsonl", first_records),
            ("test2.jsonl", second_records),
        ):
            srsly.write_jsonl(tmp_dir / name, records)
            apply(tmp_dir, out_file, "blank:en", "field", 1, 1)
 def test_applycli_txt():
    """Smoke test: a file with an unknown extension is read as plain text."""
    with make_tempdir() as tmp_dir:
        with open(tmp_dir / "test.foo", "w") as text_file:
            text_file.write("Testing apply cli.")
        apply(tmp_dir, tmp_dir / "testout.spacy", "blank:en", "text", 1, 1)
 def test_applycli_mixed():
    """`apply` on a directory mixing .jsonl, .spacy and plain text input."""
    text = "Testing apply cli"
    nlp = spacy.blank("en")
    with make_tempdir() as tmp_dir:
        out_file = tmp_dir / "testout.spacy"
        # One document per supported input format, all with the same text.
        srsly.write_jsonl(tmp_dir / "test.jsonl", [{"text": text}])
        in_docbin = DocBin()
        in_docbin.add(nlp(text))
        in_docbin.to_disk(tmp_dir / "testin.spacy")
        with open(tmp_dir / "test.txt", "w") as text_file:
            text_file.write(text)
        apply(tmp_dir, out_file, "blank:en", "text", 1, 1)
        # Check whether it worked
        docs = list(DocBin().from_disk(out_file).get_docs(nlp.vocab))
        assert len(docs) == 3
        assert all(doc.text == text for doc in docs)
 def test_applycli_user_data():
    """Custom extension values stored in the input DocBin survive `apply`."""
    # force=True keeps the test from failing when "ext" is already registered,
    # e.g. if the test runs more than once in the same process.
    Doc.set_extension("ext", default=0, force=True)
    val = ("ext", 0)
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        doc._.ext = val
        docbin = DocBin(store_user_data=True)
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        # The JSONL key is irrelevant here: the only input is a .spacy file.
        apply(data_path, output, "blank:en", "", 1, 1)
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert result[0]._.ext == val
 def test_local_remote_storage():
    with make_tempdir() as d:
        filename = "a.txt"
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -12,6 +12,7 @@ menu:
  - ['train', 'train']
  - ['pretrain', 'pretrain']
  - ['evaluate', 'evaluate']
  - ['apply', 'apply']
  - ['find-threshold', 'find-threshold']
  - ['assemble', 'assemble']
  - ['package', 'package']
@ -474,7 +475,7 @@ report span characteristics such as the average span length and the span (or
 span boundary) distinctiveness. The distinctiveness measure shows how different
 the tokens are with respect to the rest of the corpus using the KL-divergence of
 the token distributions. To learn more, you can check out Papay et al.'s work on
-[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
+[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
 </Infobox>
@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
 | `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
 | **CREATES**                               | Training results and optional metrics and visualizations.                                                                                                                            |
 ## apply {#apply new="3.5" tag="command"}
 Applies a trained pipeline to data and stores the resulting annotated documents
 in a `DocBin`. The input can be a single file or a directory. The recognized
 input formats are:
 1. `.spacy`
 2. `.jsonl` containing a user specified `text_key`
 3. Files with any other extension are assumed to be plain text files containing
   a single document.
 When a directory is provided it is traversed recursively to collect all files.
 ```cli
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force] [--gpu-id] [--batch-size] [--n-process]
 ```
 | Name                                      | Description                                                                                                                                                                          |
 | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `model`                                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
 | `data-path`                               | Location of data to be processed in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
 | `output-file`                             | Output `DocBin` path. ~~Path (positional)~~                                                                                                                                          |
 | `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--text-key`, `-tk`                       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~str (option)~~                                                                                       |
 | `--force`, `-F`                           | If the provided `output-file` already exists, force `apply` to overwrite it. Without this flag (default), `apply` exits with an error instead. ~~bool (flag)~~                       |
 | `--gpu-id`, `-g`                          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
 | `--batch-size`, `-b`                      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
 | `--n-process`, `-n`                       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
 | `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
 | **CREATES**                               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |
 ## find-threshold {#find-threshold new="3.5" tag="command"}
 Runs prediction trials for a trained model with varying thresholds to maximize
@ -1187,7 +1219,6 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 | Name                    | Description                                                                                                                                                                          |
 | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |