mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	Add apply CLI (#11376)
* annotate cli first try * add batch-size and n_process * rename to apply * typing fix * handle file suffixes * walk directories * support jsonl * typing fix * remove debug * make suffix optional for walk * revert unrelated * don't warn but raise * better error message * minor touch up * Update spacy/tests/test_cli.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * update tests and bugfix * add force_overwrite * typo * fix adding .spacy suffix * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/cli/apply.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * store user data and rename cmd arg * include test for user attr * rename cmd arg * better help message * documentation * prettier * black * link fix * Update spacy/cli/apply.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update website/docs/api/cli.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * addressing reviews * dont quit but warn * prettier Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
This commit is contained in:
		
							parent
							
								
									18ffe5bbd6
								
							
						
					
					
						commit
						c223cd7a86
					
				|  | @ -16,6 +16,7 @@ from .debug_config import debug_config  # noqa: F401 | |||
| from .debug_model import debug_model  # noqa: F401 | ||||
| from .debug_diff import debug_diff  # noqa: F401 | ||||
| from .evaluate import evaluate  # noqa: F401 | ||||
| from .apply import apply  # noqa: F401 | ||||
| from .convert import convert  # noqa: F401 | ||||
| from .init_pipeline import init_pipeline_cli  # noqa: F401 | ||||
| from .init_config import init_config, fill_config  # noqa: F401 | ||||
|  |  | |||
|  | @ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None: | |||
|             local_msg.info("To switch to GPU 0, use the option: --gpu-id 0") | ||||
| 
 | ||||
| 
 | ||||
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
    """Collect the files found under `path`, in sorted order.

    path (Path): A file or a directory. A file is returned unchanged as a
        one-element list, without any filtering.
    suffix (Optional[str]): If set, only files whose name ends with this
        string are kept.
    RETURNS (List[Path]): The sorted list of files that were found. Entries
        whose name starts with "." are skipped.
    """
    if not path.is_dir():
        return [path]
    # Work list that grows while it is iterated: every directory appends its
    # children, which makes the traversal recursive without recursion.
    paths = [path]
    locs = []
    seen = set()
    for current in paths:
        key = str(current)
        if key in seen:
            continue
        seen.add(key)
        # Use `.name` rather than `.parts[-1]`: `Path(".")` has no parts, so
        # indexing raised IndexError when walking the current directory.
        name = current.name
        if name.startswith("."):
            continue
        elif current.is_dir():
            paths.extend(current.iterdir())
        elif suffix is not None and not name.endswith(suffix):
            continue
        else:
            locs.append(current)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs
| 
 | ||||
| 
 | ||||
| def _format_number(number: Union[int, float], ndigits: int = 2) -> str: | ||||
|     """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s, | ||||
|     as happens with `round(number, ndigits)`""" | ||||
|  |  | |||
							
								
								
									
										143
									
								
								spacy/cli/apply.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										143
									
								
								spacy/cli/apply.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,143 @@ | |||
| import tqdm | ||||
| import srsly | ||||
| 
 | ||||
| from itertools import chain | ||||
| from pathlib import Path | ||||
| from typing import Optional, List, Iterable, cast, Union | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory | ||||
| 
 | ||||
| from ..tokens import Doc, DocBin | ||||
| from ..vocab import Vocab | ||||
| from ..util import ensure_path, load_model | ||||
| 
 | ||||
| 
 | ||||
# Help text for the positional data path argument of the `apply` command.
path_help = (
    "Location of the documents to predict on.\n"
    "Can be a single file in .spacy format or a .jsonl file.\n"
    "Files with other extensions are treated as single plain text documents.\n"
    "If a directory is provided it is traversed recursively to grab\n"
    "all files to be processed.\n"
    "The files can be a mixture of .spacy, .jsonl and text files.\n"
    "If .jsonl is provided the specified field is going\n"
    'to be grabbed ("text" by default).'
)

# Help texts for the remaining arguments and options.
out_help = "Path to save the resulting .spacy file"
code_help = "Path to Python file with additional code (registered functions) to be imported"
gold_help = "Use gold preprocessing provided in the .spacy files"

# Shown when the output file exists and overwriting was not requested.
force_msg = (
    "The provided output file already exists. To force overwriting "
    "the output file, set the --force or -F flag."
)
| 
 | ||||
| 
 | ||||
| DocOrStrStream = Union[Iterable[str], Iterable[Doc]] | ||||
| 
 | ||||
| 
 | ||||
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
    """
    Load the DocBin stored at `path` and yield the Doc objects it
    contains, reconstructed with `vocab`.
    """
    yield from DocBin().from_disk(path).get_docs(vocab)
| 
 | ||||
| 
 | ||||
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
    """
    Yield the value stored under `field` for every record of the JSONL
    file at `path`. A record that lacks the key is reported through
    `msg.fail` with exit code 1.
    """
    for record in srsly.read_jsonl(path):
        if field in record:
            yield record[field]
        else:
            msg.fail(
                f"{path} does not contain the required '{field}' field.", exits=1
            )
| 
 | ||||
| 
 | ||||
| def _stream_texts(paths: Iterable[Path]) -> Iterable[str]: | ||||
|     """ | ||||
|     Yields strings from text files in paths. | ||||
|     """ | ||||
|     for path in paths: | ||||
|         with open(path, "r") as fin: | ||||
|             text = fin.read() | ||||
|             yield text | ||||
| 
 | ||||
| 
 | ||||
@app.command("apply")
def apply_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output_file: Path = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
    # fmt: on
):
    """
    Apply a trained pipeline to documents to get predictions.
    Expects a loadable spaCy pipeline and path to the data, which
    can be a directory or a file.
    The data files can be provided in multiple formats:
        1. .spacy files
        2. .jsonl files with a specified "field" to read the text from.
        3. Files with any other extension are assumed to be containing
           a single document.
    DOCS: https://spacy.io/api/cli#apply
    """
    data_path = ensure_path(data_path)
    output_file = ensure_path(output_file)
    code_path = ensure_path(code_path)
    # `apply` appends ".spacy" to an output path without a suffix. Do the
    # same here first, otherwise the overwrite check below would inspect a
    # different file than the one that ends up being written.
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    if output_file.exists() and not force_overwrite:
        msg.fail(force_msg, exits=1)
    if not data_path.exists():
        msg.fail(f"Couldn't find data path: {data_path}", exits=1)
    import_code(code_path)
    setup_gpu(use_gpu)
    apply(data_path, output_file, model, text_key, batch_size, n_process)
| 
 | ||||
| 
 | ||||
def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
):
    """Run a pipeline over all documents under `data_path` and save a DocBin.

    data_path (Path): File or directory to read. Directories are traversed
        with `walk_directory`; ".spacy" files are read as DocBins, ".jsonl"
        files as JSONL records, everything else as one plain text each.
    output_file (Path): Where to write the DocBin. ".spacy" is appended
        when the path has no suffix.
    model (str): Name or path handed to `load_model`.
    json_field (str): Key holding the text in ".jsonl" records.
    batch_size (int): Batch size forwarded to `nlp.pipe`.
    n_process (int): Number of processes forwarded to `nlp.pipe`.
    """
    # Normalise the output path once, so that the empty-input branch and the
    # regular branch write to the same location.
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if not paths:
        # Still write an (empty) DocBin so the output file always exists.
        docbin.to_disk(output_file)
        msg.warn("Did not find data to process,"
                 f" {data_path} seems to be an empty directory.")
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files: List[Path] = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            text_files.append(path)
    # All plain text files share one stream, placed after the other inputs.
    if text_files:
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
        docbin.add(doc)
    docbin.to_disk(output_file)
|  | @ -1,4 +1,4 @@ | |||
| from typing import Callable, Iterable, Mapping, Optional, Any, List, Union | ||||
| from typing import Callable, Iterable, Mapping, Optional, Any, Union | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from wasabi import Printer | ||||
|  | @ -7,7 +7,7 @@ import re | |||
| import sys | ||||
| import itertools | ||||
| 
 | ||||
| from ._util import app, Arg, Opt | ||||
| from ._util import app, Arg, Opt, walk_directory | ||||
| from ..training import docs_to_json | ||||
| from ..tokens import Doc, DocBin | ||||
| from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs | ||||
|  | @ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]: | |||
|     return None | ||||
| 
 | ||||
| 
 | ||||
def walk_directory(path: Path, converter: str) -> List[Path]:
    """Collect the files found under `path`, in sorted order.

    path (Path): A file or a directory. A file is returned unchanged as a
        one-element list, without any filtering.
    converter (str): For "json", "conll" and "iob", only files whose name
        ends with that same string are kept; any other value keeps all files.
    RETURNS (List[Path]): The sorted list of files that were found. Entries
        whose name starts with "." are skipped.
    """
    if not path.is_dir():
        return [path]
    # Converters whose input files are recognised by a matching name ending.
    filtered_converters = ("json", "conll", "iob")
    # Work list that grows while it is iterated: every directory appends its
    # children, which makes the traversal recursive without recursion.
    paths = [path]
    locs = []
    seen = set()
    for current in paths:
        key = str(current)
        if key in seen:
            continue
        seen.add(key)
        # Use `.name` rather than `.parts[-1]`: `Path(".")` has no parts, so
        # indexing raised IndexError when walking the current directory.
        name = current.name
        if name.startswith("."):
            continue
        elif current.is_dir():
            paths.extend(current.iterdir())
        elif converter in filtered_converters and not name.endswith(converter):
            continue
        else:
            locs.append(current)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs
| 
 | ||||
| 
 | ||||
| def verify_cli_args( | ||||
|     msg: Printer, | ||||
|     input_path: Path, | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any | |||
| import pkg_resources | ||||
| import time | ||||
| 
 | ||||
| import spacy | ||||
| import numpy | ||||
| import pytest | ||||
| import srsly | ||||
|  | @ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name | |||
| from spacy.cli.project.remote_storage import RemoteStorage | ||||
| from spacy.cli.project.run import _check_requirements | ||||
| from spacy.cli.validate import get_model_pkgs | ||||
| from spacy.cli.apply import apply | ||||
| from spacy.cli.find_threshold import find_threshold | ||||
| from spacy.lang.en import English | ||||
| from spacy.lang.nl import Dutch | ||||
|  | @ -885,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct(): | |||
|     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] | ||||
| 
 | ||||
| 
 | ||||
def test_applycli_empty_dir():
    """An empty input directory still produces an (empty) output DocBin."""
    with make_tempdir() as data_path:
        output = data_path / "test.spacy"
        apply(data_path, output, "blank:en", "text", 1, 1)
        # Nothing to process: apply writes an empty DocBin and returns.
        assert output.exists()
        assert len(DocBin().from_disk(output)) == 0
| 
 | ||||
| 
 | ||||
def test_applycli_docbin():
    """Run apply on a .spacy input, first empty and then holding one Doc."""
    with make_tempdir() as data_path:
        source = data_path / "testin.spacy"
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        docbin = DocBin()
        # test empty DocBin case
        docbin.to_disk(source)
        apply(data_path, output, "blank:en", "text", 1, 1)
        # Same input file, now containing a single document.
        docbin.add(doc)
        docbin.to_disk(source)
        apply(data_path, output, "blank:en", "text", 1, 1)
| 
 | ||||
| 
 | ||||
def test_applycli_jsonl():
    """Run apply on .jsonl inputs, reading the text from a custom key."""
    first_records = [{"field": "Testing apply cli.", "key": 234}]
    second_records = [{"field": "234"}]
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        # One file at a time, so apply runs with one and then two inputs.
        srsly.write_jsonl(data_path / "test.jsonl", first_records)
        apply(data_path, output, "blank:en", "field", 1, 1)
        srsly.write_jsonl(data_path / "test2.jsonl", second_records)
        apply(data_path, output, "blank:en", "field", 1, 1)
| 
 | ||||
| 
 | ||||
def test_applycli_txt():
    """Run apply on a file with an unknown extension, read as plain text."""
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        (data_path / "test.foo").write_text("Testing apply cli.")
        apply(data_path, output, "blank:en", "text", 1, 1)
| 
 | ||||
| 
 | ||||
def test_applycli_mixed():
    """Run apply on a directory mixing .jsonl, .spacy and plain text input."""
    text = "Testing apply cli"
    nlp = spacy.blank("en")
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        # The same text, once per supported input format.
        srsly.write_jsonl(data_path / "test.jsonl", [{"text": text}])
        source_docbin = DocBin()
        source_docbin.add(nlp(text))
        source_docbin.to_disk(data_path / "testin.spacy")
        (data_path / "test.txt").write_text(text)
        apply(data_path, output, "blank:en", "text", 1, 1)
        # Check whether it worked
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert len(result) == 3
        assert [processed.text for processed in result] == [text] * 3
| 
 | ||||
| 
 | ||||
def test_applycli_user_data():
    """Custom extension values stored in the input survive apply."""
    # Extensions are registered globally on Doc. Registering an existing name
    # again raises, so only register "ext" if nothing else has done so.
    if not Doc.has_extension("ext"):
        Doc.set_extension("ext", default=0)
    val = ("ext", 0)
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        doc._.ext = val
        docbin = DocBin(store_user_data=True)
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        # The only input is a .spacy file, so the JSONL key is never used.
        apply(data_path, output, "blank:en", "", 1, 1)
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert result[0]._.ext == val
| 
 | ||||
| 
 | ||||
| def test_local_remote_storage(): | ||||
|     with make_tempdir() as d: | ||||
|         filename = "a.txt" | ||||
|  |  | |||
|  | @ -12,6 +12,7 @@ menu: | |||
|   - ['train', 'train'] | ||||
|   - ['pretrain', 'pretrain'] | ||||
|   - ['evaluate', 'evaluate'] | ||||
|   - ['apply', 'apply'] | ||||
|   - ['find-threshold', 'find-threshold'] | ||||
|   - ['assemble', 'assemble'] | ||||
|   - ['package', 'package'] | ||||
|  | @ -474,7 +475,7 @@ report span characteristics such as the average span length and the span (or | |||
| span boundary) distinctiveness. The distinctiveness measure shows how different | ||||
| the tokens are with respect to the rest of the corpus using the KL-divergence of | ||||
| the token distributions. To learn more, you can check out Papay et al.'s work on | ||||
| [*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/). | ||||
| [_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/). | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
|  | @ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr | |||
| | `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           | | ||||
| | **CREATES**                               | Training results and optional metrics and visualizations.                                                                                                                            | | ||||
| 
 | ||||
| ## apply {#apply new="3.5" tag="command"} | ||||
| 
 | ||||
| Applies a trained pipeline to data and stores the resulting annotated documents | ||||
| in a `DocBin`. The input can be a single file or a directory. The recognized | ||||
| input formats are: | ||||
| 
 | ||||
| 1. `.spacy` | ||||
| 2. `.jsonl` containing a user-specified `text_key` | ||||
| 3. Files with any other extension are assumed to be plain text files containing | ||||
|    a single document. | ||||
| 
 | ||||
| When a directory is provided it is traversed recursively to collect all files. | ||||
| 
 | ||||
| ```cli | ||||
| $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force] [--gpu-id] [--batch-size] [--n-process] | ||||
| ``` | ||||
| 
 | ||||
| | Name                                      | Description                                                                                                                                                                          | | ||||
| | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `model`                                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  | | ||||
| | `data-path`                               | Location of data to be processed in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 | | ||||
| | `output-file`                             | Output `DocBin` path. ~~Path (positional)~~                                                                                                                                          | | ||||
| | `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | ||||
| | `--text-key`, `-tk`                       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            | | ||||
| | `--force`, `-F`                           | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    | | ||||
| | `--gpu-id`, `-g`                          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       | | ||||
| | `--batch-size`, `-b`                      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  | | ||||
| | `--n-process`, `-n`                       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         | | ||||
| | `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           | | ||||
| | **CREATES**                               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             | | ||||
| 
 | ||||
| ## find-threshold {#find-threshold new="3.5" tag="command"} | ||||
| 
 | ||||
| Runs prediction trials for a trained model with varying thresholds to maximize | ||||
|  | @ -1187,7 +1219,6 @@ be provided. | |||
| > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f | ||||
| > ``` | ||||
| 
 | ||||
| 
 | ||||
| | Name                    | Description                                                                                                                                                                          | | ||||
| | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `model`                 | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           | | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user