mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-07 05:40:20 +03:00
annotate cli first try
This commit is contained in:
parent
c09d2fa25b
commit
fe563e6ba9
|
@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
|
|||
from .debug_model import debug_model # noqa: F401
|
||||
from .debug_diff import debug_diff # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .annotate import annotate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
from .init_config import init_config, fill_config # noqa: F401
|
||||
|
|
117
spacy/cli/annotate.py
Normal file
117
spacy/cli/annotate.py
Normal file
|
@ -0,0 +1,117 @@
|
|||
import tqdm
|
||||
import sys
|
||||
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code
|
||||
from typing import Optional, Generator
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..vocab import Vocab
|
||||
from .. import util
|
||||
|
||||
|
||||
path_help = ("Location of the documents to predict on."
|
||||
"Can be a single file in .spacy format or "
|
||||
"a text file with one document per line."
|
||||
"If a directory is provided each "
|
||||
"text file in the directory will be treated "
|
||||
"as a single document.")
|
||||
out_help = "Path where to save the result .spacy file"
|
||||
code_help = ("Path to Python file with additional "
|
||||
"code (registered functions) to be imported")
|
||||
gold_help = "Use gold preprocessing provided in the .spacy files"
|
||||
|
||||
|
||||
def _stream_data(
    data_path: Path,
    vocab: Vocab
) -> Generator[Doc, None, None]:
    """Lazily yield documents from data_path.

    The path may be a single file in .spacy format, a single plain text
    file with one document per line, or a directory of plain text files
    where each file is treated as one document.

    data_path (Path): File or directory to read from.
    vocab (Vocab): Vocab used to deserialize Docs from a DocBin.
    YIELDS: Doc objects when reading a DocBin, otherwise raw text
        strings (nlp.pipe accepts both).
    """
    if not data_path.is_dir():
        # EAFP: we cannot reliably tell a .spacy file from plain text up
        # front, so try the DocBin format first and fall back to text.
        # Only the deserialization itself is inside the try so that a
        # ValueError raised later (mid-iteration) cannot accidentally
        # re-route us into the plain-text branch after Docs were yielded.
        try:
            docbin = DocBin().from_disk(data_path)
        except ValueError:
            # Not a DocBin -- treat as plain text, one document per line.
            try:
                with open(data_path, "r", encoding="utf8") as fin:
                    for line in fin:
                        yield line
            except UnicodeDecodeError:
                # Consistent with annotate(): report via wasabi and exit
                # non-zero instead of print() + bare sys.exit().
                msg.fail(
                    f"file {data_path} does not seem to be a plain text file",
                    exits=1,
                )
        else:
            for doc in docbin.get_docs(vocab):
                yield doc
    else:
        # Yield one document per (text) file in the directory.
        for path in data_path.iterdir():
            if path.is_dir():
                raise ValueError(
                    "All files should be text files."
                )
            try:
                with open(path, "r", encoding="utf8") as fin:
                    yield fin.read()
            except UnicodeDecodeError:
                msg.fail(
                    f"file {path} does not seem to be a plain text file",
                    exits=1,
                )
|
||||
|
||||
|
||||
@app.command("annotate")
|
||||
def annotate_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help=path_help, exists=True),
|
||||
output: Optional[Path] = Arg(..., help=out_help, dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size"),
|
||||
n_process: int = Opt(1, "--n-process", "-n", help="Number of processors to use")
|
||||
):
|
||||
"""
|
||||
Run a trained pipeline over documents.
|
||||
Expects a loadable spaCy pipeline and some data as input.
|
||||
The input can be provided multiple formats. It can be a .spacy
|
||||
file, a single text file with one document per line or a directory
|
||||
where each file is assumed to be plain text document.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#tba
|
||||
"""
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
annotate(data_path, output, model)
|
||||
|
||||
|
||||
def annotate(
    data_path: Path,
    output: Path,
    model: str,
    batch_size: int = 1,
    n_process: int = 1,
):
    """Run a trained pipeline over documents and save the predictions.

    data_path (Path): A .spacy file, a plain text file, or a directory
        of text files (see _stream_data for the accepted formats).
    output (Path): Destination for the resulting .spacy file. If it is
        an existing directory, "predictions.spacy" is written inside it.
    model (str): Loadable pipeline name or path.
    batch_size (int): Batch size for nlp.pipe. Defaults to 1, matching
        the previous hard-coded behavior.
    n_process (int): Number of worker processes for nlp.pipe. Defaults
        to 1, matching the previous hard-coded behavior.
    """
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    if not data_path.exists():
        msg.fail("Couldn't find data path.", data_path, exits=1)
    nlp = util.load_model(model)
    msg.good(f"Loaded model {model}")
    docbin = DocBin()
    # Stream the input so arbitrarily large corpora fit in memory, and
    # forward the batching/parallelism options the CLI collects.
    docs = nlp.pipe(
        _stream_data(data_path, nlp.vocab),
        batch_size=batch_size,
        n_process=n_process,
    )
    for doc in tqdm.tqdm(docs):
        docbin.add(doc)
    if output_path.is_dir():
        output_path = output_path / "predictions.spacy"
    docbin.to_disk(output_path)
|
Loading…
Reference in New Issue
Block a user