mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Add apply CLI (#11376)
* annotate cli first try * add batch-size and n_process * rename to apply * typing fix * handle file suffixes * walk directories * support jsonl * typing fix * remove debug * make suffix optional for walk * revert unrelated * don't warn but raise * better error message * minor touch up * Update spacy/tests/test_cli.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * update tests and bugfix * add force_overwrite * typo * fix adding .spacy suffix * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * store user data and rename cmd arg * include test for user attr * rename cmd arg * better help message * documentation * prettier * black * link fix * Update spacy/cli/apply.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * addressing reviews * dont quit but warn * prettier Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
This commit is contained in:
parent
18ffe5bbd6
commit
c223cd7a86
|
@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
|
|||
from .debug_model import debug_model # noqa: F401
|
||||
from .debug_diff import debug_diff # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .apply import apply # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
from .init_config import init_config, fill_config # noqa: F401
|
||||
|
|
|
@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
|
|||
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
|
||||
|
||||
|
||||
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif suffix is not None and not path.parts[-1].endswith(suffix):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
|
||||
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
|
||||
as happens with `round(number, ndigits)`"""
|
||||
|
|
143
spacy/cli/apply.py
Normal file
143
spacy/cli/apply.py
Normal file
|
@ -0,0 +1,143 @@
|
|||
import tqdm
|
||||
import srsly
|
||||
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Iterable, cast, Union
|
||||
|
||||
from wasabi import msg
|
||||
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
|
||||
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..vocab import Vocab
|
||||
from ..util import ensure_path, load_model
|
||||
|
||||
|
||||
path_help = """Location of the documents to predict on.
|
||||
Can be a single file in .spacy format or a .jsonl file.
|
||||
Files with other extensions are treated as single plain text documents.
|
||||
If a directory is provided it is traversed recursively to grab
|
||||
all files to be processed.
|
||||
The files can be a mixture of .spacy, .jsonl and text files.
|
||||
If .jsonl is provided the specified field is going
|
||||
to be grabbed ("text" by default)."""
|
||||
|
||||
out_help = "Path to save the resulting .spacy file"
|
||||
code_help = (
|
||||
"Path to Python file with additional " "code (registered functions) to be imported"
|
||||
)
|
||||
gold_help = "Use gold preprocessing provided in the .spacy files"
|
||||
force_msg = (
|
||||
"The provided output file already exists. "
|
||||
"To force overwriting the output file, set the --force or -F flag."
|
||||
)
|
||||
|
||||
|
||||
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
|
||||
|
||||
|
||||
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
|
||||
"""
|
||||
Stream Doc objects from DocBin.
|
||||
"""
|
||||
docbin = DocBin().from_disk(path)
|
||||
for doc in docbin.get_docs(vocab):
|
||||
yield doc
|
||||
|
||||
|
||||
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
|
||||
"""
|
||||
Stream "text" field from JSONL. If the field "text" is
|
||||
not found it raises error.
|
||||
"""
|
||||
for entry in srsly.read_jsonl(path):
|
||||
if field not in entry:
|
||||
msg.fail(
|
||||
f"{path} does not contain the required '{field}' field.", exits=1
|
||||
)
|
||||
else:
|
||||
yield entry[field]
|
||||
|
||||
|
||||
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
|
||||
"""
|
||||
Yields strings from text files in paths.
|
||||
"""
|
||||
for path in paths:
|
||||
with open(path, "r") as fin:
|
||||
text = fin.read()
|
||||
yield text
|
||||
|
||||
|
||||
@app.command("apply")
|
||||
def apply_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help=path_help, exists=True),
|
||||
output_file: Path = Arg(..., help=out_help, dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
|
||||
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
|
||||
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
|
||||
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
|
||||
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
|
||||
):
|
||||
"""
|
||||
Apply a trained pipeline to documents to get predictions.
|
||||
Expects a loadable spaCy pipeline and path to the data, which
|
||||
can be a directory or a file.
|
||||
The data files can be provided in multiple formats:
|
||||
1. .spacy files
|
||||
2. .jsonl files with a specified "field" to read the text from.
|
||||
3. Files with any other extension are assumed to be containing
|
||||
a single document.
|
||||
DOCS: https://spacy.io/api/cli#apply
|
||||
"""
|
||||
data_path = ensure_path(data_path)
|
||||
output_file = ensure_path(output_file)
|
||||
code_path = ensure_path(code_path)
|
||||
if output_file.exists() and not force_overwrite:
|
||||
msg.fail(force_msg, exits=1)
|
||||
if not data_path.exists():
|
||||
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
apply(data_path, output_file, model, text_key, batch_size, n_process)
|
||||
|
||||
|
||||
def apply(
|
||||
data_path: Path,
|
||||
output_file: Path,
|
||||
model: str,
|
||||
json_field: str,
|
||||
batch_size: int,
|
||||
n_process: int,
|
||||
):
|
||||
docbin = DocBin(store_user_data=True)
|
||||
paths = walk_directory(data_path)
|
||||
if len(paths) == 0:
|
||||
docbin.to_disk(output_file)
|
||||
msg.warn("Did not find data to process,"
|
||||
f" {data_path} seems to be an empty directory.")
|
||||
return
|
||||
nlp = load_model(model)
|
||||
msg.good(f"Loaded model {model}")
|
||||
vocab = nlp.vocab
|
||||
streams: List[DocOrStrStream] = []
|
||||
text_files = []
|
||||
for path in paths:
|
||||
if path.suffix == ".spacy":
|
||||
streams.append(_stream_docbin(path, vocab))
|
||||
elif path.suffix == ".jsonl":
|
||||
streams.append(_stream_jsonl(path, json_field))
|
||||
else:
|
||||
text_files.append(path)
|
||||
if len(text_files) > 0:
|
||||
streams.append(_stream_texts(text_files))
|
||||
datagen = cast(DocOrStrStream, chain(*streams))
|
||||
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
|
||||
docbin.add(doc)
|
||||
if output_file.suffix == "":
|
||||
output_file = output_file.with_suffix(".spacy")
|
||||
docbin.to_disk(output_file)
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
|
||||
from typing import Callable, Iterable, Mapping, Optional, Any, Union
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
@ -7,7 +7,7 @@ import re
|
|||
import sys
|
||||
import itertools
|
||||
|
||||
from ._util import app, Arg, Opt
|
||||
from ._util import app, Arg, Opt, walk_directory
|
||||
from ..training import docs_to_json
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||
|
@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif converter == "json" and not path.parts[-1].endswith("json"):
|
||||
continue
|
||||
elif converter == "conll" and not path.parts[-1].endswith("conll"):
|
||||
continue
|
||||
elif converter == "iob" and not path.parts[-1].endswith("iob"):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def verify_cli_args(
|
||||
msg: Printer,
|
||||
input_path: Path,
|
||||
|
|
|
@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
|
|||
import pkg_resources
|
||||
import time
|
||||
|
||||
import spacy
|
||||
import numpy
|
||||
import pytest
|
||||
import srsly
|
||||
|
@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
|
|||
from spacy.cli.project.remote_storage import RemoteStorage
|
||||
from spacy.cli.project.run import _check_requirements
|
||||
from spacy.cli.validate import get_model_pkgs
|
||||
from spacy.cli.apply import apply
|
||||
from spacy.cli.find_threshold import find_threshold
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.nl import Dutch
|
||||
|
@ -885,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
|
|||
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
||||
|
||||
|
||||
def test_applycli_empty_dir():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "test.spacy"
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_docbin():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
nlp = spacy.blank("en")
|
||||
doc = nlp("testing apply cli.")
|
||||
# test empty DocBin case
|
||||
docbin = DocBin()
|
||||
docbin.to_disk(data_path / "testin.spacy")
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
docbin.add(doc)
|
||||
docbin.to_disk(data_path / "testin.spacy")
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_jsonl():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
data = [{"field": "Testing apply cli.", "key": 234}]
|
||||
data2 = [{"field": "234"}]
|
||||
srsly.write_jsonl(data_path / "test.jsonl", data)
|
||||
apply(data_path, output, "blank:en", "field", 1, 1)
|
||||
srsly.write_jsonl(data_path / "test2.jsonl", data2)
|
||||
apply(data_path, output, "blank:en", "field", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_txt():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
with open(data_path / "test.foo", "w") as ftest:
|
||||
ftest.write("Testing apply cli.")
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_mixed():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
text = "Testing apply cli"
|
||||
nlp = spacy.blank("en")
|
||||
doc = nlp(text)
|
||||
jsonl_data = [{"text": text}]
|
||||
srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
|
||||
docbin = DocBin()
|
||||
docbin.add(doc)
|
||||
docbin.to_disk(data_path / "testin.spacy")
|
||||
with open(data_path / "test.txt", "w") as ftest:
|
||||
ftest.write(text)
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
# Check whether it worked
|
||||
result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
|
||||
assert len(result) == 3
|
||||
for doc in result:
|
||||
assert doc.text == text
|
||||
|
||||
|
||||
def test_applycli_user_data():
|
||||
Doc.set_extension("ext", default=0)
|
||||
val = ("ext", 0)
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
nlp = spacy.blank("en")
|
||||
doc = nlp("testing apply cli.")
|
||||
doc._.ext = val
|
||||
docbin = DocBin(store_user_data=True)
|
||||
docbin.add(doc)
|
||||
docbin.to_disk(data_path / "testin.spacy")
|
||||
apply(data_path, output, "blank:en", "", 1, 1)
|
||||
result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
|
||||
assert result[0]._.ext == val
|
||||
|
||||
|
||||
def test_local_remote_storage():
|
||||
with make_tempdir() as d:
|
||||
filename = "a.txt"
|
||||
|
|
|
@ -12,6 +12,7 @@ menu:
|
|||
- ['train', 'train']
|
||||
- ['pretrain', 'pretrain']
|
||||
- ['evaluate', 'evaluate']
|
||||
- ['apply', 'apply']
|
||||
- ['find-threshold', 'find-threshold']
|
||||
- ['assemble', 'assemble']
|
||||
- ['package', 'package']
|
||||
|
@ -474,7 +475,7 @@ report span characteristics such as the average span length and the span (or
|
|||
span boundary) distinctiveness. The distinctiveness measure shows how different
|
||||
the tokens are with respect to the rest of the corpus using the KL-divergence of
|
||||
the token distributions. To learn more, you can check out Papay et al.'s work on
|
||||
[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
|
||||
[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
|
|||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Training results and optional metrics and visualizations. |
|
||||
|
||||
## apply {#apply new="3.5" tag="command"}
|
||||
|
||||
Applies a trained pipeline to data and stores the resulting annotated documents
|
||||
in a `DocBin`. The input can be a single file or a directory. The recognized
|
||||
input formats are:
|
||||
|
||||
1. `.spacy`
|
||||
2. `.jsonl` containing a user specified `text_key`
|
||||
3. Files with any other extension are assumed to be plain text files containing
|
||||
a single document.
|
||||
|
||||
When a directory is provided it is traversed recursively to collect all files.
|
||||
|
||||
```cli
|
||||
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
|
||||
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
|
||||
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
|
||||
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
|
||||
|
||||
## find-threshold {#find-threshold new="3.5" tag="command"}
|
||||
|
||||
Runs prediction trials for a trained model with varying tresholds to maximize
|
||||
|
@ -1187,7 +1219,6 @@ be provided.
|
|||
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
|
||||
> ```
|
||||
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
|
|
Loading…
Reference in New Issue
Block a user