Add `annotate` CLI command (first draft)

This commit is contained in:
kadarakos 2022-08-24 15:22:13 +00:00
parent c09d2fa25b
commit fe563e6ba9
2 changed files with 118 additions and 0 deletions

View File

@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .annotate import annotate # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401

117
spacy/cli/annotate.py Normal file
View File

@ -0,0 +1,117 @@
import sys
from pathlib import Path
from typing import Generator, Optional, Union

import tqdm
from wasabi import msg

from ._util import app, Arg, Opt, setup_gpu, import_code
from ..tokens import Doc, DocBin
from ..vocab import Vocab
from .. import util
# Help texts for the typer arguments/options of the "annotate" command below.
path_help = ("Location of the documents to predict on."
"Can be a single file in .spacy format or "
"a text file with one document per line."
"If a directory is provided each "
"text file in the directory will be treated "
"as a single document.")
out_help = "Path where to save the result .spacy file"
code_help = ("Path to Python file with additional "
"code (registered functions) to be imported")
gold_help = "Use gold preprocessing provided in the .spacy files"
def _stream_data(
data_path: Path,
vocab: Vocab
) -> Generator[Doc, None, None]:
"""
Load data which is either in a single file
in .spacy or plain text format or multiple
text files in a directory.
"""
# XXX I know that we have it in the developer guidelines
# to don't try/except, but I thought its appropriate here.
# because we are not sure exactly what input we are getting.
if not data_path.is_dir():
# Yield from DocBin.
try:
docbin = DocBin().from_disk(data_path)
for doc in docbin.get_docs(vocab):
yield doc
# Yield from text file.
except ValueError:
try:
with open(data_path, 'r') as fin:
for line in fin:
yield line
except UnicodeDecodeError:
print(
f"file {data_path} does not seem "
"to be a plain text file"
)
sys.exit()
else:
# Yield per one file in directory
for path in data_path.iterdir():
if path.is_dir():
raise ValueError(
"All files should be text files."
)
with open(path, 'r') as fin:
try:
text = fin.read()
yield text
except UnicodeDecodeError:
print(
f"file {path} does not seem "
"to be a plain text file"
)
sys.exit()
@app.command("annotate")
def annotate_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output: Optional[Path] = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size"),
    n_process: int = Opt(1, "--n-process", "-n", help="Number of processors to use")
    # fmt: on
):
    """
    Run a trained pipeline over documents.
    Expects a loadable spaCy pipeline and some data as input.
    The input can be provided in multiple formats. It can be a .spacy
    file, a single text file with one document per line or a directory
    where each file is assumed to be a plain text document.
    DOCS: https://spacy.io/api/cli#tba
    """
    # Import user code (registered functions) before loading the model.
    import_code(code_path)
    # Allocate the requested GPU; -1 keeps everything on CPU.
    setup_gpu(use_gpu)
    # NOTE(review): batch_size and n_process are parsed but never forwarded
    # to annotate() — confirm whether they should be wired through.
    # NOTE(review): output is typed Optional but declared as a required
    # Arg(...) — the Optional looks spurious; verify intended CLI contract.
    annotate(data_path, output, model)
def annotate(
    data_path: Path,
    output: Path,
    model: str,
    batch_size: int = 1,
    n_process: int = 1,
):
    """Apply a trained pipeline to documents and save the predictions.

    data_path: input documents; a .spacy file, a plain-text file with one
        document per line, or a directory of plain-text files
        (see _stream_data).
    output: target path for the resulting .spacy file. If it is an
        existing directory, "predictions.spacy" is written inside it.
    model: loadable pipeline name or path.
    batch_size: batch size passed to nlp.pipe (default 1, matching the
        previous hard-coded behavior).
    n_process: number of processes passed to nlp.pipe (default 1).

    Exits with an error message if data_path does not exist.
    """
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    if not data_path.exists():
        msg.fail("Couldn't find data path.", data_path, exits=1)
    nlp = util.load_model(model)
    msg.good(f"Loaded model {model}")
    docbin = DocBin()
    # Stream input documents through the pipeline; tqdm shows progress.
    docs = nlp.pipe(
        _stream_data(data_path, nlp.vocab),
        batch_size=batch_size,
        n_process=n_process,
    )
    for doc in tqdm.tqdm(docs):
        docbin.add(doc)
    if output_path.is_dir():
        output_path = output_path / "predictions.spacy"
    docbin.to_disk(output_path)