Refactor CLI

This commit is contained in:
Ines Montani 2020-06-21 21:35:01 +02:00
parent c12713a8be
commit 275bab62df
15 changed files with 451 additions and 209 deletions

View File

@ -1,4 +1,7 @@
from spacy.cli import app
from typer.main import get_command
if __name__ == "__main__":
app()
command = get_command(app)
# Ensure that the help messages always display the correct prompt
command(prog_name="python -m spacy")

View File

@ -34,10 +34,10 @@ class FileTypes(str, Enum):
@app.command("convert")
def convert(
def convert_cli(
# fmt: off
input_file: str = Arg(..., help="Input file"),
output_dir: str = Arg("-", help="Output directory. '-' for stdout."),
input_file: str = Arg(..., help="Input file", exists=True),
output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"),
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
@ -45,7 +45,7 @@ def convert(
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)"),
ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
# fmt: on
):
@ -58,8 +58,39 @@ def convert(
if isinstance(file_type, FileTypes):
# We get an instance of the FileTypes from the CLI so we need its string value
file_type = file_type.value
no_print = output_dir == "-"
msg = Printer(no_print=no_print)
silent = output_dir == "-"
convert(
input_file,
output_dir,
file_type=file_type,
n_sents=n_sents,
seg_sents=seg_sents,
model=model,
morphology=morphology,
merge_subtokens=merge_subtokens,
converter=converter,
ner_map_path=ner_map_path,
lang=lang,
silent=silent,
)
def convert(
input_file: Path,
output_dir: Path,
*,
file_type: str = "json",
n_sents: int = 1,
seg_sents: bool = False,
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
ner_map_path: Optional[Path] = None,
lang: Optional[str] = None,
silent: bool = True,
) -> None:
msg = Printer(no_print=silent, pretty=not silent)
input_path = Path(input_file)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
# TODO: support msgpack via stdout in srsly?
@ -85,7 +116,8 @@ def convert(
converter = converter_autodetect
else:
msg.warn(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
"Can't automatically detect NER format. Conversion may not "
"succeed. See https://spacy.io/api/cli#convert"
)
if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
@ -102,7 +134,7 @@ def convert(
merge_subtokens=merge_subtokens,
lang=lang,
model=model,
no_print=no_print,
no_print=silent,
ner_map=ner_map,
)
if output_dir != "-":
@ -124,7 +156,7 @@ def convert(
srsly.write_jsonl("-", data)
def autodetect_ner_format(input_data):
def autodetect_ner_format(input_data: str) -> str:
# guess format from the first 20 lines
lines = input_data.split("\n")[:20]
format_guesses = {"ner": 0, "iob": 0}

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, List, Sequence, Dict, Any, Tuple
from pathlib import Path
from collections import Counter
import sys
@ -6,8 +6,9 @@ import srsly
from wasabi import Printer, MESSAGES
from ._app import app, Arg, Opt
from ..gold import GoldCorpus
from ..gold import GoldCorpus, Example
from ..syntax import nonproj
from ..language import Language
from ..util import load_model, get_lang_class
@ -21,12 +22,12 @@ BLANK_MODEL_THRESHOLD = 2000
@app.command("debug-data")
def debug_data(
def debug_data_cli(
# fmt: off
lang: str = Arg(..., help="Model language"),
train_path: Path = Arg(..., help="Location of JSON-formatted training data"),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data"),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
@ -39,8 +40,36 @@ def debug_data(
stats, and find problems like invalid entity annotations, cyclic
dependencies, low data labels and more.
"""
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
debug_data(
lang,
train_path,
dev_path,
tag_map_path=tag_map_path,
base_model=base_model,
pipeline=[p.strip() for p in pipeline.split(",")],
ignore_warnings=ignore_warnings,
verbose=verbose,
no_format=no_format,
silent=False,
)
def debug_data(
lang: str,
train_path: Path,
dev_path: Path,
*,
tag_map_path: Optional[Path] = None,
base_model: Optional[str] = None,
pipeline: List[str] = ["tagger", "parser", "ner"],
ignore_warnings: bool = False,
verbose: bool = False,
no_format: bool = True,
silent: bool = True,
):
msg = Printer(
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
)
# Make sure all files and paths exists if they are needed
if not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
@ -52,7 +81,6 @@ def debug_data(
tag_map = srsly.read_json(tag_map_path)
# Initialize the model and pipeline
pipeline = [p.strip() for p in pipeline.split(",")]
if base_model:
nlp = load_model(base_model)
else:
@ -449,7 +477,7 @@ def debug_data(
sys.exit(1)
def _load_file(file_path, msg):
def _load_file(file_path: Path, msg: Printer) -> None:
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
with msg.loading(f"Loading {file_name}..."):
@ -468,7 +496,9 @@ def _load_file(file_path, msg):
)
def _compile_gold(examples, pipeline, nlp):
def _compile_gold(
examples: Sequence[Example], pipeline: List[str], nlp: Language
) -> Dict[str, Any]:
data = {
"ner": Counter(),
"cats": Counter(),
@ -540,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp):
return data
def _format_labels(labels, counts=False):
def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
if counts:
return ", ".join([f"'{l}' ({c})" for l, c in labels])
return ", ".join([f"'{l}'" for l in labels])
def _get_examples_without_label(data, label):
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
count = 0
for ex in data:
labels = [
@ -559,7 +589,7 @@ def _get_examples_without_label(data, label):
return count
def _get_labels_from_model(nlp, pipe_name):
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
if pipe_name not in nlp.pipe_names:
return set()
pipe = nlp.get_pipe(pipe_name)

View File

@ -1,31 +1,36 @@
from typing import List
from typing import Optional, Sequence, Union
import requests
import os
import subprocess
import sys
from wasabi import msg
import typer
from ._app import app, Arg, Opt
from .. import about
from ..util import is_package, get_base_version
from ..util import is_package, get_base_version, run_command
@app.command(
"download",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def download(
def download_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model to download (shortcut or name)"),
direct: bool = Opt(False, "--direct", "-d", help="Force direct download of name + version"),
pip_args: List[str] = Arg(..., help="Additional arguments to be passed to `pip install` on model install"),
# fmt: on
):
"""
Download compatible model from default download path using pip. If --direct
flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped.
For direct downloads, the compatibility check will be skipped. All
additional arguments provided to this command will be passed to `pip install`
on model installation.
"""
download(model, direct, *ctx.args)
def download(model: str, direct: bool = False, *pip_args) -> None:
if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
@ -41,22 +46,20 @@ def download(
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
if dl != 0: # if download subprocess doesn't return 0, exit
sys.exit(dl)
msg.good(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
msg.good(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
def get_json(url, desc):
def get_json(url: str, desc: str) -> Union[dict, list]:
r = requests.get(url)
if r.status_code != 200:
msg.fail(
@ -70,7 +73,7 @@ def get_json(url, desc):
return r.json()
def get_compatibility():
def get_compatibility() -> dict:
version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"]
@ -79,7 +82,7 @@ def get_compatibility():
return comp[version]
def get_version(model, comp):
def get_version(model: str, comp: dict) -> str:
model = get_base_version(model)
if model not in comp:
msg.fail(
@ -89,10 +92,12 @@ def get_version(model, comp):
return comp[model][0]
def download_model(filename, user_pip_args=None):
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:
download_url = about.__download_url__ + "/" + filename
pip_args = ["--no-cache-dir"]
if user_pip_args:
pip_args.extend(user_pip_args)
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
return subprocess.call(cmd, env=os.environ.copy())
run_command(cmd)

View File

@ -1,29 +1,52 @@
from typing import Optional
from typing import Optional, List
from timeit import default_timer as timer
from wasabi import msg
from wasabi import Printer
from pathlib import Path
from ._app import app, Arg, Opt
from ..tokens import Doc
from ..scorer import Scorer
from ..gold import GoldCorpus
from .. import util
from .. import displacy
@app.command("evaluate")
def evaluate(
def evaluate_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: str = Arg(..., help="Location of JSON-formatted evaluation data"),
data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[str] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
# fmt: on
):
"""
Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument.
"""
evaluate(
model,
data_path,
gpu_id=gpu_id,
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
silent=False,
)
def evaluate(
model: str,
data_path: Path,
gpu_id: int = -1,
gold_preproc: bool = False,
displacy_path: Optional[Path] = None,
displacy_limit: int = 25,
silent: bool = True,
) -> Scorer:
msg = Printer(no_print=silent, pretty=not silent)
util.fix_random_seed()
if gpu_id >= 0:
util.use_gpu(gpu_id)
@ -78,11 +101,17 @@ def evaluate(
ents=render_ents,
)
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
if return_scores:
return scorer.scores
return scorer.scores
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
def render_parses(
docs: List[Doc],
output_path: Path,
model_name: str = "",
limit: int = 250,
deps: bool = True,
ents: bool = True,
):
docs[0].user_data["title"] = model_name
if ents:
html = displacy.render(docs[:limit], style="ent", page=True)

View File

@ -1,7 +1,7 @@
from typing import Optional
from typing import Optional, Dict, Any, Union
import platform
from pathlib import Path
from wasabi import msg
from wasabi import Printer
import srsly
from ._app import app, Arg, Opt
@ -11,7 +11,7 @@ from .. import about
@app.command("info")
def info(
def info_cli(
# fmt: off
model: Optional[str] = Arg(None, help="Optional model name"),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
@ -23,60 +23,83 @@ def info(
print model information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
"""
info(model, markdown=markdown, silent=silent)
def info(
model: Optional[str], *, markdown: bool = False, silent: bool = True
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if model:
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = model
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["link"] = str(model_path)
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
title = f"Info about model '{model}'"
data = info_model(model, silent=silent)
else:
title = "Info about spaCy"
data = info_spacy(silent=silent)
markdown_data = get_markdown(data, title=title)
if markdown:
if not silent:
title = f"Info about model '{model}'"
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
if markdown:
print_markdown(model_meta, title=title)
else:
msg.table(model_meta, title=title)
return meta
all_models, _ = get_model_pkgs()
data = {
print(markdown_data)
return markdown_data
if not silent:
msg.table(data, title=title)
return data
def info_spacy(*, silent: bool = True) -> Dict[str, Any]:
    """Generate info about the current spaCy installation.

    silent (bool): Don't print anything, just return. Forwarded to
        get_model_pkgs.
    RETURNS (dict): The spaCy info: version, install location, platform,
        Python version and a comma-separated summary of the model packages.
    """
    all_models, _ = get_model_pkgs(silent=silent)
    # One "name (version)" entry per model package reported by get_model_pkgs
    models = ", ".join(f"{m['name']} ({m['version']})" for m in all_models.values())
    return {
        "spaCy version": about.__version__,
        "Location": str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": models,
    }
if not silent:
title = "Info about spaCy"
if markdown:
print_markdown(data, title=title)
else:
msg.table(data, title=title)
return data
def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
    """Generate info about a specific model.

    model (str): Model name or path.
    silent (bool): Don't print anything, just return.
    RETURNS (dict): The model meta, without the "accuracy" and "speed"
        entries and with a "source" (and, if the path resolves elsewhere,
        "link") entry added.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    if util.is_package(model):
        model_path = util.get_package_path(model)
    else:
        # A plain string supports neither the "/" operator nor .resolve(),
        # so wrap a user-supplied location in a Path before building on it.
        model_path = Path(model)
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    resolved_path = model_path.resolve()
    if resolved_path != model_path:
        # The given location differs from its resolved form: report both
        meta["link"] = str(model_path)
        meta["source"] = str(resolved_path)
    else:
        meta["source"] = str(model_path)
    return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
"""Get data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
title (str / None): Title, will be rendered as headline 2.
RETURNS (str): The Markdown string.
"""
markdown = []
for key, value in data.items():
if isinstance(value, str) and Path(value).exists():
continue
markdown.append(f"* **{key}:** {value}")
result = "\n{}\n".format("\n".join(markdown))
if title:
print(f"\n## {title}")
print("\n{}\n".format("\n".join(markdown)))
result = f"\n## {title}\n{result}"
return result

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, List, Dict, Any, Union, IO
import math
from tqdm import tqdm
import numpy
@ -10,11 +10,12 @@ import gzip
import zipfile
import srsly
import warnings
from wasabi import msg
from wasabi import Printer
from ._app import app, Arg, Opt
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..language import Language
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
from ..lookups import Lookups
@ -28,14 +29,14 @@ DEFAULT_OOV_PROB = -20
@app.command("init-model")
def init_model(
def init_model_cli(
# fmt: off
lang: str = Arg(..., help="Model language"),
output_dir: Path = Arg(..., help="Model output directory"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file"),
clusters_loc: Optional[str] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data"),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file"),
vectors_loc: Optional[str] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
@ -49,6 +50,38 @@ def init_model(
and word vectors. If vectors are provided in Word2Vec format, they can
be either a .txt or zipped as a .zip or .tar.gz.
"""
init_model(
lang,
output_dir,
freqs_loc=freqs_loc,
clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc,
prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors,
vectors_name=vectors_name,
model_name=model_name,
omit_extra_lookups=omit_extra_lookups,
base_model=base_model,
silent=False,
)
def init_model(
lang: str,
output_dir: Path,
freqs_loc: Optional[Path] = None,
clusters_loc: Optional[Path] = None,
jsonl_loc: Optional[Path] = None,
vectors_loc: Optional[Path] = None,
prune_vectors: int = -1,
truncate_vectors: int = 0,
vectors_name: Optional[str] = None,
model_name: Optional[str] = None,
omit_extra_lookups: bool = False,
base_model: Optional[str] = None,
silent: bool = True,
) -> Language:
msg = Printer(no_print=silent, pretty=not silent)
if jsonl_loc is not None:
if freqs_loc is not None or clusters_loc is not None:
settings = ["-j"]
@ -71,7 +104,7 @@ def init_model(
freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists():
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
@ -86,7 +119,9 @@ def init_model(
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
add_vectors(
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
@ -98,7 +133,7 @@ def init_model(
return nlp
def open_file(loc):
def open_file(loc: Union[str, Path]) -> IO:
"""Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc)
if tarfile.is_tarfile(str(loc)):
@ -114,7 +149,9 @@ def open_file(loc):
return loc.open("r", encoding="utf8")
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
def read_attrs_from_deprecated(
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
) -> List[Dict[str, Any]]:
if freqs_loc is not None:
with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc)
@ -142,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
return lex_attrs
def create_model(lang, lex_attrs, name=None, base_model=None):
def create_model(
lang: str,
lex_attrs: List[Dict[str, Any]],
name: Optional[str] = None,
base_model: Optional[Union[str, Path]] = None,
) -> Language:
if base_model:
nlp = load_model(base_model)
# keep the tokenizer but remove any existing pipeline components due to
@ -169,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
return nlp
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
def add_vectors(
msg: Printer,
nlp: Language,
vectors_loc: Optional[Path],
truncate_vectors: int,
prune_vectors: int,
name: Optional[str] = None,
) -> None:
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@ -179,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
else:
if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(vectors_loc)
vectors_data, vector_keys = read_vectors(msg, vectors_loc)
msg.good(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
@ -198,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(vectors_loc, truncate_vectors=0):
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1:
@ -218,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
return vectors_data, vectors_keys
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
def read_freqs(
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
):
counts = PreshCounter()
total = 0
with freqs_loc.open() as f:
@ -247,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
return probs, oov_prob
def read_clusters(clusters_loc):
def read_clusters(clusters_loc: Path) -> dict:
clusters = {}
if ftfy is None:
warnings.warn(Warnings.W004)

View File

@ -1,22 +1,24 @@
from typing import Optional
from typing import Optional, Union, Any, Dict
import shutil
from pathlib import Path
from wasabi import msg, get_raw_input
from wasabi import Printer, get_raw_input
import srsly
import sys
from ._app import app, Arg, Opt
from ..schemas import validate, ModelMetaSchema
from .. import util
from .. import about
@app.command("package")
def package(
def package_cli(
# fmt: off
input_dir: str = Arg(..., help="Directory with model data"),
output_dir: str = Arg(..., help="Output parent directory"),
meta_path: Optional[str] = Opt(None, "--meta-path", "-m", help="Path to meta.json"),
input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", help="Create meta.json, even if one exists"),
force: bool = Opt(False, "--force", "-f", help="Force overwriting existing model in output directory"),
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
# fmt: on
):
"""
@ -26,6 +28,25 @@ def package(
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
"""
package(
input_dir,
output_dir,
meta_path=meta_path,
create_meta=create_meta,
force=force,
silent=False,
)
def package(
input_dir: Path,
output_dir: Path,
meta_path: Optional[Path] = None,
create_meta: bool = False,
force: bool = False,
silent: bool = True,
) -> None:
msg = Printer(no_print=silent, pretty=not silent)
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
@ -36,23 +57,20 @@ def package(
if meta_path and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path)
else:
meta = generate_meta(input_dir, meta, msg)
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(
f"No '{key}' setting found in meta.json",
"This setting is required to build your package.",
exits=1,
)
meta_path = meta_path or input_dir / "meta.json"
if not meta_path.exists() or not meta_path.is_file():
msg.fail("Can't load model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path)
else:
meta = generate_meta(input_dir, meta, msg)
errors = validate(ModelMetaSchema, meta)
if errors:
msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
model_name = meta["lang"] + "_" + meta["name"]
model_name_v = model_name + "-" + meta["version"]
main_path = output_path / model_name_v
main_path = output_dir / model_name_v
package_path = main_path / model_name
if package_path.exists():
@ -66,21 +84,26 @@ def package(
exits=1,
)
Path.mkdir(package_path, parents=True)
shutil.copytree(str(input_path), str(package_path / model_name_v))
shutil.copytree(str(input_dir), str(package_path / model_name_v))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.")
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"])
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
msg.good(f"Successfully created zipped Python package", zip_file)
def create_file(file_path, contents):
def create_file(file_path: Path, contents: str) -> None:
    """Write text to a file as UTF-8, creating the file if it doesn't exist.

    file_path (Path): Location of the file to write. Existing content is
        replaced, since the file is opened in "w" mode.
    contents (str): The text to write.
    """
    file_path.touch()
    # Use a context manager so the handle is flushed and closed right away
    # instead of whenever the file object happens to be garbage-collected.
    with file_path.open("w", encoding="utf-8") as file_:
        file_.write(contents)
def generate_meta(model_path, existing_meta, msg):
def generate_meta(
model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
) -> Dict[str, Any]:
meta = existing_meta or {}
settings = [
("lang", "Model language", meta.get("lang", "en")),

View File

@ -19,12 +19,12 @@ from ..gold import Example
@app.command("pretrain")
def pretrain(
def pretrain_cli(
# fmt: off
texts_loc: str =Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'"),
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"),
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
config_path: Path = Arg(..., help="Path to config file"),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
@ -45,6 +45,26 @@ def pretrain(
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
"""
pretrain(
texts_loc,
vectors_model,
output_dir,
config_path,
use_gpu=use_gpu,
resume_path=resume_path,
epoch_resume=epoch_resume,
)
def pretrain(
texts_loc: Path,
vectors_model: str,
output_dir: Path,
config_path: Path,
use_gpu: int = -1,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Sequence, Union, Iterator
import tqdm
from pathlib import Path
import srsly
@ -7,17 +7,18 @@ import pstats
import sys
import itertools
import ml_datasets
from wasabi import msg
from wasabi import msg, Printer
from ._app import app, Arg, Opt
from ..language import Language
from ..util import load_model
@app.command("profile")
def profile(
def profile_cli(
# fmt: off
model: str = Arg(..., help="Model to load"),
inputs: Optional[str] = Arg(None, help="Location of input file. '-' for stdin."),
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
# fmt: on
):
@ -27,6 +28,10 @@ def profile(
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via Thinc.
"""
profile(model, inputs=inputs, n_texts=n_texts)
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
if inputs is not None:
inputs = _read_inputs(inputs, msg)
if inputs is None:
@ -46,12 +51,12 @@ def profile(
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
    """Run the texts through the pipeline and discard the results.

    nlp (Language): The pipeline whose pipe() method processes the texts.
    texts (Sequence[str]): The texts to process, wrapped in a tqdm
        progress bar.
    """
    with_progress = tqdm.tqdm(texts)
    docs = nlp.pipe(with_progress, batch_size=16)
    # Exhaust the iterator; the processed docs themselves are not needed
    for _ in docs:
        pass
def _read_inputs(loc, msg):
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
if loc == "-":
msg.info("Reading input from sys.stdin")
file_ = sys.stdin

View File

@ -1,64 +1,25 @@
from typing import List, Dict
from typing import List, Dict, Any
import typer
import srsly
from pathlib import Path
import os
import subprocess
import sys
from wasabi import msg
import shlex
from ._app import app, Arg, Opt
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import run_command
# File name of the project config; load_project_config looks for it directly
# inside the project directory it is given.
CONFIG_FILE = "project.yml"
# Subdirectories created inside a project directory by the create_dirs
# variant that iterates over SUBDIRS.
SUBDIRS = [
"assets",
"configs",
"packages",
"metrics",
"scripts",
"notebooks",
"training",
]
# Same list of subdirectories in single-line form, iterated by the create_dirs
# variant that uses DIRS.
DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"]
# Typer application holding the project subcommands ("clone", "run"); it is
# attached to the main app under the name "project" via app.add_typer.
project_cli = typer.Typer(help="Command-line interface for spaCy projects")
def load_project_config(path):
    """Read and validate the project config located in a project directory.

    If the config file is missing, or validation against ProjectConfigSchema
    reports errors, the problem is reported through msg.fail with exits=1.

    path (Path): Directory expected to contain the project config file.
    RETURNS: The config as loaded from the YAML file.
    """
    project_file = path / CONFIG_FILE
    if not project_file.exists():
        msg.fail("Can't find project config", project_file, exits=1)
    loaded = srsly.read_yaml(project_file)
    problems = validate(ProjectConfigSchema, loaded)
    if problems:
        details = "\n".join(problems)
        msg.fail(f"Invalid project config in {CONFIG_FILE}", details, exits=1)
    return loaded
def create_dirs(project_dir: Path):
    """Create the standard project subdirectories inside a project directory.

    Missing parent directories are created as well. Subdirectories that
    already exist are left untouched, so the function can be run on a
    directory that is already (partially) populated.

    project_dir (Path): Directory in which to create the subdirectories.
    """
    for subdir in SUBDIRS:
        # exist_ok avoids a FileExistsError when the directory is already there
        (project_dir / subdir).mkdir(parents=True, exist_ok=True)
def run_cmd(command: str):
    """Run a command string as a subprocess.

    The string is split with shlex and the child receives a copy of the
    current environment. If the child finishes with a non-zero return code,
    the current process exits with that same code.

    command (str): The command line to execute.
    """
    argv = shlex.split(command)
    child_env = os.environ.copy()
    return_code = subprocess.call(argv, env=child_env)
    if return_code != 0:
        sys.exit(return_code)
def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}):
    """Run a series of command strings one after another.

    Each command is formatted with the given variables, printed via msg.info
    and then handed to run_cmd.

    commands (List[str]): The command strings, possibly with placeholders.
    variables (Dict[str, str]): Values substituted into the placeholders.
    """
    for template in commands:
        # Fill in placeholders, e.g. "./{NAME}.json"
        rendered = template.format(**variables)
        msg.info(rendered)
        run_cmd(rendered)
@project_cli.command("clone")
def project_clone(
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to fetch"),
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=True, file_okay=False),
@ -70,13 +31,17 @@ def project_clone(
@project_cli.command("run")
def project_run(
def project_run_cli(
# fmt: off
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
subcommand: str = Arg(None, help="Name of command defined in project config")
# fmt: on
):
"""Run scripts defined in the project."""
project_run(project_dir, subcommand)
def project_run(project_dir: Path, subcommand: str) -> None:
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
@ -98,3 +63,27 @@ def project_run(
app.add_typer(project_cli, name="project")
def load_project_config(path: Path) -> Dict[str, Any]:
    """Read and validate the project config located in a project directory.

    If the config file is missing, or validation against ProjectConfigSchema
    reports errors, the problem is reported through msg.fail with exits=1.

    path (Path): Directory expected to contain the project config file.
    RETURNS (Dict[str, Any]): The config as loaded from the YAML file.
    """
    project_file = path / CONFIG_FILE
    if not project_file.exists():
        msg.fail("Can't find project config", project_file, exits=1)
    loaded = srsly.read_yaml(project_file)
    problems = validate(ProjectConfigSchema, loaded)
    if problems:
        details = "\n".join(problems)
        msg.fail(f"Invalid project config in {CONFIG_FILE}", details, exits=1)
    return loaded
def create_dirs(project_dir: Path) -> None:
    """Create the standard project subdirectories inside a project directory.

    Missing parent directories are created as well. Subdirectories that
    already exist are left untouched, so the function can be run on a
    directory that is already (partially) populated.

    project_dir (Path): Directory in which to create the subdirectories.
    """
    for subdir in DIRS:
        # exist_ok avoids a FileExistsError when the directory is already there
        (project_dir / subdir).mkdir(parents=True, exist_ok=True)
def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None:
    """Run a series of command strings one after another.

    Each command is formatted with the given variables, printed via msg.info,
    split with shlex and then handed to run_command.

    commands (List[str]): The command strings, possibly with placeholders.
    variables (Dict[str, str]): Values substituted into the placeholders.
    """
    for template in commands:
        # Fill in placeholders, e.g. "./{NAME}.json"
        rendered = template.format(**variables)
        msg.info(rendered)
        argv = shlex.split(rendered)
        run_command(argv)

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Dict
from timeit import default_timer as timer
import srsly
import tqdm
@ -85,9 +85,9 @@ subword_features = true
@app.command("train")
def train_cli(
# fmt: off
train_path: Path = Arg(..., help="Location of JSON-formatted training data"),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data"),
config_path: Path = Arg(..., help="Path to config file"),
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
@ -162,14 +162,14 @@ def train_cli(
def train(
config_path,
data_paths,
raw_text=None,
output_path=None,
tag_map=None,
weights_data=None,
omit_extra_lookups=False,
):
config_path: Path,
data_paths: Dict[str, Path],
raw_text: Optional[Path] = None,
output_path: Optional[Path] = None,
tag_map: Optional[Path] = None,
weights_data: Optional[bytes] = None,
omit_extra_lookups: bool = False,
) -> None:
msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False)

View File

@ -1,7 +1,8 @@
from typing import Tuple
from pathlib import Path
import sys
import requests
from wasabi import msg
from wasabi import msg, Printer
from ._app import app
from .. import about
@ -10,11 +11,15 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
def validate():
def validate_cli():
"""
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
validate()
def validate() -> None:
model_pkgs, compat = get_model_pkgs()
spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {})
@ -57,7 +62,8 @@ def validate():
sys.exit(1)
def get_model_pkgs():
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
msg = Printer(no_print=silent, pretty=not silent)
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@ -95,7 +101,7 @@ def get_model_pkgs():
return pkgs, compat
def reformat_version(version):
def reformat_version(version: str) -> str:
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith("-alpha"):
return version.replace("-alpha", "a0")

View File

@ -1,4 +1,4 @@
from typing import Dict, List, Union, Optional, Sequence
from typing import Dict, List, Union, Optional, Sequence, Any
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath
@ -164,7 +164,7 @@ class ModelMetaSchema(BaseModel):
email: Optional[StrictStr] = Field(None, title="Model author email")
url: Optional[StrictStr] = Field(None, title="Model author URL")
sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors")
vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors")
accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
# fmt: on

View File

@ -1,10 +1,10 @@
from typing import Iterator, List, Union
import os
import importlib
import importlib.util
import re
from pathlib import Path
import random
from typing import List
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
import functools
@ -17,6 +17,8 @@ import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
import subprocess
from contextlib import contextmanager
try:
@ -427,6 +429,30 @@ def get_package_path(name):
return Path(pkg.__file__).parent
def run_command(command: List[str]) -> None:
    """Execute an already-split command in a subprocess.

    The child receives a copy of the current environment. If it finishes with
    a non-zero return code, the current process exits with that same code.

    command (list): The split command.
    """
    child_env = os.environ.copy()
    return_code = subprocess.call(command, env=child_env)
    if return_code != 0:
        sys.exit(return_code)
@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[None]:
    """Change the current working directory and return to the previous one
    on exit.

    path (str / Path): The directory to navigate to.
    YIELDS: Nothing; the body of the `with` block runs inside `path`.
    """
    prev_cwd = Path.cwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        # Restore the original directory even if the body raised
        os.chdir(str(prev_cwd))
def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.