Add `annotate` CLI command (first draft)

This commit is contained in:
kadarakos 2022-08-24 15:22:13 +00:00
parent c09d2fa25b
commit fe563e6ba9
2 changed files with 118 additions and 0 deletions

View File

@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .annotate import annotate # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401

117
spacy/cli/annotate.py Normal file
View File

@ -0,0 +1,117 @@
import sys
from pathlib import Path
from typing import Generator, Optional, Union

import tqdm
from wasabi import msg

from ._util import app, Arg, Opt, setup_gpu, import_code
from ..tokens import Doc, DocBin
from ..vocab import Vocab
from .. import util
# Help texts for the typer arguments/options of the "annotate" command below.
path_help = ("Location of the documents to predict on."
"Can be a single file in .spacy format or "
"a text file with one document per line."
"If a directory is provided each "
"text file in the directory will be treated "
"as a single document.")
out_help = "Path where to save the result .spacy file"
code_help = ("Path to Python file with additional "
"code (registered functions) to be imported")
gold_help = "Use gold preprocessing provided in the .spacy files"
def _stream_data(
data_path: Path,
vocab: Vocab
) -> Generator[Doc, None, None]:
"""
Load data which is either in a single file
in .spacy or plain text format or multiple
text files in a directory.
"""
# XXX I know that we have it in the developer guidelines
# to don't try/except, but I thought its appropriate here.
# because we are not sure exactly what input we are getting.
if not data_path.is_dir():
# Yield from DocBin.
try:
docbin = DocBin().from_disk(data_path)
for doc in docbin.get_docs(vocab):
yield doc
# Yield from text file.
except ValueError:
try:
with open(data_path, 'r') as fin:
for line in fin:
yield line
except UnicodeDecodeError:
print(
f"file {data_path} does not seem "
"to be a plain text file"
)
sys.exit()
else:
# Yield per one file in directory
for path in data_path.iterdir():
if path.is_dir():
raise ValueError(
"All files should be text files."
)
with open(path, 'r') as fin:
try:
text = fin.read()
yield text
except UnicodeDecodeError:
print(
f"file {path} does not seem "
"to be a plain text file"
)
sys.exit()
@app.command("annotate")
def annotate_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output: Optional[Path] = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size"),
    n_process: int = Opt(1, "--n-process", "-n", help="Number of processors to use")
    # fmt: on
):
    """
    Run a trained pipeline over documents.
    Expects a loadable spaCy pipeline and some data as input.
    The input can be provided in multiple formats. It can be a .spacy
    file, a single text file with one document per line or a directory
    where each file is assumed to be a plain text document.
    DOCS: https://spacy.io/api/cli#tba
    """
    # Import user code (registered functions) before loading the model.
    import_code(code_path)
    # Allocate the requested GPU; -1 keeps everything on CPU.
    setup_gpu(use_gpu)
    # NOTE(review): batch_size and n_process are parsed but never forwarded
    # to annotate() — confirm whether they should be wired through.
    # NOTE(review): output is typed Optional but declared as a required
    # Arg(...) — the Optional looks spurious; verify intended CLI contract.
    annotate(data_path, output, model)
def annotate(
    data_path: Path,
    output: Path,
    model: str,
    batch_size: int = 1,
    n_process: int = 1,
):
    """Apply a trained pipeline to documents and save the predictions.

    data_path: input documents; a .spacy file, a plain-text file with one
        document per line, or a directory of plain-text files
        (see _stream_data).
    output: target path for the resulting .spacy file. If it is an
        existing directory, "predictions.spacy" is written inside it.
    model: loadable pipeline name or path.
    batch_size: batch size passed to nlp.pipe (default 1, matching the
        previous hard-coded behavior).
    n_process: number of processes passed to nlp.pipe (default 1).

    Exits with an error message if data_path does not exist.
    """
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    if not data_path.exists():
        msg.fail("Couldn't find data path.", data_path, exits=1)
    nlp = util.load_model(model)
    msg.good(f"Loaded model {model}")
    docbin = DocBin()
    # Stream input documents through the pipeline; tqdm shows progress.
    docs = nlp.pipe(
        _stream_data(data_path, nlp.vocab),
        batch_size=batch_size,
        n_process=n_process,
    )
    for doc in tqdm.tqdm(docs):
        docbin.add(doc)
    if output_path.is_dir():
        output_path = output_path / "predictions.spacy"
    docbin.to_disk(output_path)