Refactor CLI

2025-10-24 20:51:30 +03:00 · 2020-06-21 21:35:01 +02:00 · 2020-06-21 21:35:01 +02:00 · 275bab62df
commit 275bab62df
parent c12713a8be
15 changed files with 451 additions and 209 deletions
--- a/spacy/main.py
+++ b/spacy/main.py
@ -1,4 +1,7 @@
 from spacy.cli import app
 from typer.main import get_command
 if __name__ == "__main__":
-    app()
+    command = get_command(app)
    # Ensure that the help messages always display the correct prompt
    command(prog_name="python -m spacy")
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -34,10 +34,10 @@ class FileTypes(str, Enum):
@app.command("convert")
-def convert(
+def convert_cli(
    # fmt: off
-    input_file: str = Arg(..., help="Input file"),
+    input_file: str = Arg(..., help="Input file", exists=True),
-    output_dir: str = Arg("-", help="Output directory. '-' for stdout."),
+    output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
    file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"),
    n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
    seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
@ -45,7 +45,7 @@ def convert(
    morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
    merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
    converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
-    ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)"),
+    ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
    lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
    # fmt: on
 ):
@ -58,8 +58,39 @@ def convert(
    if isinstance(file_type, FileTypes):
        # We get an instance of the FileTypes from the CLI so we need its string value
        file_type = file_type.value
-    no_print = output_dir == "-"
+    silent = output_dir == "-"
-    msg = Printer(no_print=no_print)
+    convert(
        input_file,
        output_dir,
        file_type=file_type,
        n_sents=n_sents,
        seg_sents=seg_sents,
        model=model,
        morphology=morphology,
        merge_subtokens=merge_subtokens,
        converter=converter,
        ner_map_path=ner_map_path,
        lang=lang,
        silent=silent,
    )
 def convert(
    input_file: Path,
    output_dir: Path,
    *,
    file_type: str = "json",
    n_sents: int = 1,
    seg_sents: bool = False,
    model: Optional[str] = None,
    morphology: bool = False,
    merge_subtokens: bool = False,
    converter: str = "auto",
    ner_map_path: Optional[Path] = None,
    lang: Optional[str] = None,
    silent: bool = True,
 ) -> None:
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = Path(input_file)
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
@ -85,7 +116,8 @@ def convert(
            converter = converter_autodetect
        else:
            msg.warn(
-                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
+                "Can't automatically detect NER format. Conversion may not "
                "succeed. See https://spacy.io/api/cli#convert"
            )
    if converter not in CONVERTERS:
        msg.fail(f"Can't find converter for {converter}", exits=1)
@ -102,7 +134,7 @@ def convert(
        merge_subtokens=merge_subtokens,
        lang=lang,
        model=model,
-        no_print=no_print,
+        no_print=silent,
        ner_map=ner_map,
    )
    if output_dir != "-":
@ -124,7 +156,7 @@ def convert(
            srsly.write_jsonl("-", data)
-def autodetect_ner_format(input_data):
+def autodetect_ner_format(input_data: str) -> str:
    # guess format from the first 20 lines
    lines = input_data.split("\n")[:20]
    format_guesses = {"ner": 0, "iob": 0}
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, List, Sequence, Dict, Any, Tuple
 from pathlib import Path
 from collections import Counter
 import sys
@ -6,8 +6,9 @@ import srsly
 from wasabi import Printer, MESSAGES
 from ._app import app, Arg, Opt
-from ..gold import GoldCorpus
+from ..gold import GoldCorpus, Example
 from ..syntax import nonproj
 from ..language import Language
 from ..util import load_model, get_lang_class
@ -21,12 +22,12 @@ BLANK_MODEL_THRESHOLD = 2000
@app.command("debug-data")
-def debug_data(
+def debug_data_cli(
    # fmt: off
    lang: str = Arg(..., help="Model language"),
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data"),
+    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data"),
+    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
-    tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
+    tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
    base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
    pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
    ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
@ -39,8 +40,36 @@ def debug_data(
    stats, and find problems like invalid entity annotations, cyclic
    dependencies, low data labels and more.
    """
-    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
+    debug_data(
        lang,
        train_path,
        dev_path,
        tag_map_path=tag_map_path,
        base_model=base_model,
        pipeline=[p.strip() for p in pipeline.split(",")],
        ignore_warnings=ignore_warnings,
        verbose=verbose,
        no_format=no_format,
        silent=False,
    )
 def debug_data(
    lang: str,
    train_path: Path,
    dev_path: Path,
    *,
    tag_map_path: Optional[Path] = None,
    base_model: Optional[str] = None,
    pipeline: List[str] = ["tagger", "parser", "ner"],
    ignore_warnings: bool = False,
    verbose: bool = False,
    no_format: bool = True,
    silent: bool = True,
 ):
    msg = Printer(
        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
    )
    # Make sure all files and paths exist if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
@ -52,7 +81,6 @@ def debug_data(
        tag_map = srsly.read_json(tag_map_path)
    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
@ -449,7 +477,7 @@ def debug_data(
        sys.exit(1)
-def _load_file(file_path, msg):
+def _load_file(file_path: Path, msg: Printer) -> None:
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
        with msg.loading(f"Loading {file_name}..."):
@ -468,7 +496,9 @@ def _load_file(file_path, msg):
    )
-def _compile_gold(examples, pipeline, nlp):
+def _compile_gold(
    examples: Sequence[Example], pipeline: List[str], nlp: Language
 ) -> Dict[str, Any]:
    data = {
        "ner": Counter(),
        "cats": Counter(),
@ -540,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp):
    return data
-def _format_labels(labels, counts=False):
+def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
    if counts:
        return ", ".join([f"'{l}' ({c})" for l, c in labels])
    return ", ".join([f"'{l}'" for l in labels])
-def _get_examples_without_label(data, label):
+def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
    count = 0
    for ex in data:
        labels = [
@ -559,7 +589,7 @@ def _get_examples_without_label(data, label):
    return count
-def _get_labels_from_model(nlp, pipe_name):
+def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
    if pipe_name not in nlp.pipe_names:
        return set()
    pipe = nlp.get_pipe(pipe_name)
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -1,31 +1,36 @@
-from typing import List
+from typing import Optional, Sequence, Union
 import requests
 import os
 import subprocess
 import sys
 from wasabi import msg
 import typer
 from ._app import app, Arg, Opt
 from .. import about
-from ..util import is_package, get_base_version
+from ..util import is_package, get_base_version, run_command
@app.command(
    "download",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
-def download(
+def download_cli(
    # fmt: off
    ctx: typer.Context,
    model: str = Arg(..., help="Model to download (shortcut or name)"),
    direct: bool = Opt(False, "--direct", "-d", help="Force direct download of name + version"),
    pip_args: List[str] = Arg(..., help="Additional arguments to be passed to `pip install` on model install"),
    # fmt: on
 ):
    """
    Download compatible model from default download path using pip. If --direct
    flag is set, the command expects the full model name with version.
-    For direct downloads, the compatibility check will be skipped.
+    For direct downloads, the compatibility check will be skipped. All
    additional arguments provided to this command will be passed to `pip install`
    on model installation.
    """
    download(model, direct, *ctx.args)
 def download(model: str, direct: bool = False, *pip_args) -> None:
    if not is_package("spacy") and "--no-deps" not in pip_args:
        msg.warn(
            "Skipping model package dependencies and setting `--no-deps`. "
@ -41,22 +46,20 @@ def download(
        components = model.split("-")
        model_name = "".join(components[:-1])
        version = components[-1]
-        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
+        download_model(dl_tpl.format(m=model_name, v=version), pip_args)
    else:
        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
        model_name = shortcuts.get(model, model)
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
-        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
+        download_model(dl_tpl.format(m=model_name, v=version), pip_args)
        if dl != 0:  # if download subprocess doesn't return 0, exit
            sys.exit(dl)
    msg.good(
        "Download and installation successful",
        f"You can now load the model via spacy.load('{model_name}')",
    )
-def get_json(url, desc):
+def get_json(url: str, desc: str) -> Union[dict, list]:
    r = requests.get(url)
    if r.status_code != 200:
        msg.fail(
@ -70,7 +73,7 @@ def get_json(url, desc):
    return r.json()
-def get_compatibility():
+def get_compatibility() -> dict:
    version = get_base_version(about.__version__)
    comp_table = get_json(about.__compatibility__, "compatibility table")
    comp = comp_table["spacy"]
@ -79,7 +82,7 @@ def get_compatibility():
    return comp[version]
-def get_version(model, comp):
+def get_version(model: str, comp: dict) -> str:
    model = get_base_version(model)
    if model not in comp:
        msg.fail(
@ -89,10 +92,12 @@ def get_version(model, comp):
    return comp[model][0]
-def download_model(filename, user_pip_args=None):
+def download_model(
    filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
    download_url = about.__download_url__ + "/" + filename
    pip_args = ["--no-cache-dir"]
    if user_pip_args:
        pip_args.extend(user_pip_args)
    cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
-    return subprocess.call(cmd, env=os.environ.copy())
+    run_command(cmd)
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -1,29 +1,52 @@
-from typing import Optional
+from typing import Optional, List
 from timeit import default_timer as timer
-from wasabi import msg
+from wasabi import Printer
 from pathlib import Path
 from ._app import app, Arg, Opt
 from ..tokens import Doc
 from ..scorer import Scorer
 from ..gold import GoldCorpus
 from .. import util
 from .. import displacy
@app.command("evaluate")
-def evaluate(
+def evaluate_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
-    data_path: str = Arg(..., help="Location of JSON-formatted evaluation data"),
+    data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
    gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
    gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
-    displacy_path: Optional[str] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML"),
+    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
    displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
    return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
    # fmt: on
 ):
    """
    Evaluate a model. To render a sample of parses in a HTML file, set an
    output directory as the displacy_path argument.
    """
    evaluate(
        model,
        data_path,
        gpu_id=gpu_id,
        gold_preproc=gold_preproc,
        displacy_path=displacy_path,
        displacy_limit=displacy_limit,
        silent=False,
    )
 def evaluate(
    model: str,
    data_path: Path,
    gpu_id: int = -1,
    gold_preproc: bool = False,
    displacy_path: Optional[Path] = None,
    displacy_limit: int = 25,
    silent: bool = True,
 ) -> Scorer:
    msg = Printer(no_print=silent, pretty=not silent)
    util.fix_random_seed()
    if gpu_id >= 0:
        util.use_gpu(gpu_id)
@ -78,11 +101,17 @@ def evaluate(
            ents=render_ents,
        )
        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
    if return_scores:
    return scorer.scores
-def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
+def render_parses(
    docs: List[Doc],
    output_path: Path,
    model_name: str = "",
    limit: int = 250,
    deps: bool = True,
    ents: bool = True,
 ):
    docs[0].user_data["title"] = model_name
    if ents:
        html = displacy.render(docs[:limit], style="ent", page=True)
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@ -1,7 +1,7 @@
-from typing import Optional
+from typing import Optional, Dict, Any, Union
 import platform
 from pathlib import Path
-from wasabi import msg
+from wasabi import Printer
 import srsly
 from ._app import app, Arg, Opt
@ -11,7 +11,7 @@ from .. import about
@app.command("info")
-def info(
+def info_cli(
    # fmt: off
    model: Optional[str] = Arg(None, help="Optional model name"),
    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
@ -23,7 +23,54 @@ def info(
    print model information. Flag --markdown prints details in Markdown for easy
    copy-pasting to GitHub issues.
    """
    info(model, markdown=markdown, silent=silent)
 def info(
    model: Optional[str], *, markdown: bool = False, silent: bool = True
 ) -> Union[str, dict]:
    msg = Printer(no_print=silent, pretty=not silent)
    if model:
        title = f"Info about model '{model}'"
        data = info_model(model, silent=silent)
    else:
        title = "Info about spaCy"
        data = info_spacy(silent=silent)
    markdown_data = get_markdown(data, title=title)
    if markdown:
        if not silent:
            print(markdown_data)
        return markdown_data
    if not silent:
        msg.table(data, title=title)
    return data
 def info_spacy(*, silent: bool = True) -> Dict[str, any]:
    """Generate info about the current spaCy installation.
    silent (bool): Don't print anything, just return.
    RETURNS (dict): The spaCy info.
    """
    all_models, _ = get_model_pkgs(silent=silent)
    models = ", ".join(f"{m['name']} ({m['version']})" for m in all_models.values())
    return {
        "spaCy version": about.__version__,
        "Location": str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": models,
    }
 def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
    """Generate info about a specific model.
    model (str): Model name or path.
    silent (bool): Don't print anything, just return.
    RETURNS (dict): The model meta.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    if util.is_package(model):
        model_path = util.get_package_path(model)
    else:
@ -37,46 +84,22 @@ def info(
        meta["source"] = str(model_path.resolve())
    else:
        meta["source"] = str(model_path)
-        if not silent:
+    return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
            title = f"Info about model '{model}'"
            model_meta = {
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
                print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
    all_models, _ = get_model_pkgs()
    data = {
        "spaCy version": about.__version__,
        "Location": str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": ", ".join(
            f"{m['name']} ({m['version']})" for m in all_models.values()
        ),
    }
    if not silent:
        title = "Info about spaCy"
        if markdown:
            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
-def print_markdown(data, title=None):
+def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
-    """Print data in GitHub-flavoured Markdown format for issues etc.
+    """Get data in GitHub-flavoured Markdown format for issues etc.
    data (dict or list of tuples): Label/value pairs.
    title (str / None): Title, will be rendered as headline 2.
    RETURNS (str): The Markdown string.
    """
    markdown = []
    for key, value in data.items():
        if isinstance(value, str) and Path(value).exists():
            continue
        markdown.append(f"* **{key}:** {value}")
    result = "\n{}\n".format("\n".join(markdown))
    if title:
-        print(f"\n## {title}")
+        result = f"\n## {title}\n{result}"
-    print("\n{}\n".format("\n".join(markdown)))
+    return result
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, List, Dict, Any, Union, IO
 import math
 from tqdm import tqdm
 import numpy
@ -10,11 +10,12 @@ import gzip
 import zipfile
 import srsly
 import warnings
-from wasabi import msg
+from wasabi import Printer
 from ._app import app, Arg, Opt
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
 from ..language import Language
 from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
 from ..lookups import Lookups
@ -28,14 +29,14 @@ DEFAULT_OOV_PROB = -20
@app.command("init-model")
-def init_model(
+def init_model_cli(
    # fmt: off
    lang: str = Arg(..., help="Model language"),
    output_dir: Path = Arg(..., help="Model output directory"),
-    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file"),
+    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
-    clusters_loc: Optional[str] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data"),
+    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
-    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file"),
+    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
-    vectors_loc: Optional[str] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format"),
+    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
    prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
@ -49,6 +50,38 @@ def init_model(
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.
    """
    init_model(
        lang,
        output_dir,
        freqs_loc=freqs_loc,
        clusters_loc=clusters_loc,
        jsonl_loc=jsonl_loc,
        prune_vectors=prune_vectors,
        truncate_vectors=truncate_vectors,
        vectors_name=vectors_name,
        model_name=model_name,
        omit_extra_lookups=omit_extra_lookups,
        base_model=base_model,
        silent=False,
    )
 def init_model(
    lang: str,
    output_dir: Path,
    freqs_loc: Optional[Path] = None,
    clusters_loc: Optional[Path] = None,
    jsonl_loc: Optional[Path] = None,
    vectors_loc: Optional[Path] = None,
    prune_vectors: int = -1,
    truncate_vectors: int = 0,
    vectors_name: Optional[str] = None,
    model_name: Optional[str] = None,
    omit_extra_lookups: bool = False,
    base_model: Optional[str] = None,
    silent: bool = True,
 ) -> Language:
    msg = Printer(no_print=silent, pretty=not silent)
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
@ -71,7 +104,7 @@ def init_model(
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
-        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
+        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
@ -86,7 +119,9 @@ def init_model(
    msg.good("Successfully created model")
    if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
+        add_vectors(
            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
        )
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
@ -98,7 +133,7 @@ def init_model(
    return nlp
-def open_file(loc):
+def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz or unzipped files"""
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
@ -114,7 +149,9 @@ def open_file(loc):
        return loc.open("r", encoding="utf8")
-def read_attrs_from_deprecated(freqs_loc, clusters_loc):
+def read_attrs_from_deprecated(
    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
 ) -> List[Dict[str, Any]]:
    if freqs_loc is not None:
        with msg.loading("Counting frequencies..."):
            probs, _ = read_freqs(freqs_loc)
@ -142,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
    return lex_attrs
-def create_model(lang, lex_attrs, name=None, base_model=None):
+def create_model(
    lang: str,
    lex_attrs: List[Dict[str, Any]],
    name: Optional[str] = None,
    base_model: Optional[Union[str, Path]] = None,
 ) -> Language:
    if base_model:
        nlp = load_model(base_model)
        # keep the tokenizer but remove any existing pipeline components due to
@ -169,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
    return nlp
-def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
+def add_vectors(
    msg: Printer,
    nlp: Language,
    vectors_loc: Optional[Path],
    truncate_vectors: int,
    prune_vectors: int,
    name: Optional[str] = None,
 ) -> None:
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@ -179,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
    else:
        if vectors_loc:
            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(msg, vectors_loc)
            msg.good(f"Loaded vectors from {vectors_loc}")
        else:
            vectors_data, vector_keys = (None, None)
@ -198,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
        nlp.vocab.prune_vectors(prune_vectors)
-def read_vectors(vectors_loc, truncate_vectors=0):
+def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
    f = open_file(vectors_loc)
    shape = tuple(int(size) for size in next(f).split())
    if truncate_vectors >= 1:
@ -218,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
    return vectors_data, vectors_keys
-def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
+def read_freqs(
    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
 ):
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
@ -247,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    return probs, oov_prob
-def read_clusters(clusters_loc):
+def read_clusters(clusters_loc: Path) -> dict:
    clusters = {}
    if ftfy is None:
        warnings.warn(Warnings.W004)
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -1,22 +1,24 @@
-from typing import Optional
+from typing import Optional, Union, Any, Dict
 import shutil
 from pathlib import Path
-from wasabi import msg, get_raw_input
+from wasabi import Printer, get_raw_input
 import srsly
 import sys
 from ._app import app, Arg, Opt
 from ..schemas import validate, ModelMetaSchema
 from .. import util
 from .. import about
@app.command("package")
-def package(
+def package_cli(
    # fmt: off
-    input_dir: str = Arg(..., help="Directory with model data"),
+    input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
-    output_dir: str = Arg(..., help="Output parent directory"),
+    output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    meta_path: Optional[str] = Opt(None, "--meta-path", "-m", help="Path to meta.json"),
+    meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
    create_meta: bool = Opt(False, "--create-meta", "-c", help="Create meta.json, even if one exists"),
-    force: bool = Opt(False, "--force", "-f", help="Force overwriting existing model in output directory"),
+    force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
    # fmt: on
 ):
    """
@ -26,6 +28,25 @@ def package(
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.
    """
    package(
        input_dir,
        output_dir,
        meta_path=meta_path,
        create_meta=create_meta,
        force=force,
        silent=False,
    )
 def package(
    input_dir: Path,
    output_dir: Path,
    meta_path: Optional[Path] = None,
    create_meta: bool = False,
    force: bool = False,
    silent: bool = True,
 ) -> None:
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
@ -36,23 +57,20 @@ def package(
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
-    meta_path = meta_path or input_path / "meta.json"
+    meta_path = meta_path or input_dir / "meta.json"
-    if meta_path.is_file():
+    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(input_dir, meta, msg)
-    for key in ("lang", "name", "version"):
+    errors = validate(ModelMetaSchema, meta)
-        if key not in meta or meta[key] == "":
+    if errors:
-            msg.fail(
+        msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
                f"No '{key}' setting found in meta.json",
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
-    main_path = output_path / model_name_v
+    main_path = output_dir / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
@ -66,21 +84,26 @@ def package(
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
-    shutil.copytree(str(input_path), str(package_path / model_name_v))
+    shutil.copytree(str(input_dir), str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good(f"Successfully created package '{model_name_v}'", main_path)
-    msg.text("To build the package, run `python setup.py sdist` in this directory.")
+    with util.working_dir(main_path):
        util.run_command([sys.executable, "setup.py", "sdist"])
    zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
    msg.good(f"Successfully created zipped Python package", zip_file)
-def create_file(file_path, contents):
+def create_file(file_path: Path, contents: str) -> None:
    file_path.touch()
    file_path.open("w", encoding="utf-8").write(contents)
-def generate_meta(model_path, existing_meta, msg):
+def generate_meta(
    model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
 ) -> Dict[str, Any]:
    meta = existing_meta or {}
    settings = [
        ("lang", "Model language", meta.get("lang", "en")),
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -19,12 +19,12 @@ from ..gold import Example
@app.command("pretrain")
-def pretrain(
+def pretrain_cli(
    # fmt: off
-    texts_loc: str =Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'"),
+    texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
    vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"),
    output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
-    config_path: Path = Arg(..., help="Path to config file"),
+    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
@ -45,6 +45,26 @@ def pretrain(
    all settings are the same between pretraining and training. Ideally,
    this is done by using the same config file for both commands.
    """
    pretrain(
        texts_loc,
        vectors_model,
        output_dir,
        config_path,
        use_gpu=use_gpu,
        resume_path=resume_path,
        epoch_resume=epoch_resume,
    )
 def pretrain(
    texts_loc: Path,
    vectors_model: str,
    output_dir: Path,
    config_path: Path,
    use_gpu: int = -1,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
 ):
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Sequence, Union, Iterator
 import tqdm
 from pathlib import Path
 import srsly
@ -7,17 +7,18 @@ import pstats
 import sys
 import itertools
 import ml_datasets
-from wasabi import msg
+from wasabi import msg, Printer
 from ._app import app, Arg, Opt
 from ..language import Language
 from ..util import load_model
@app.command("profile")
-def profile(
+def profile_cli(
    # fmt: off
    model: str = Arg(..., help="Model to load"),
-    inputs: Optional[str] = Arg(None, help="Location of input file. '-' for stdin."),
+    inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
    n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
    # fmt: on
 ):
@ -27,6 +28,10 @@ def profile(
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    profile(model, inputs=inputs, n_texts=n_texts)
 def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
@ -46,12 +51,12 @@ def profile(
    s.strip_dirs().sort_stats("time").print_stats()
-def parse_texts(nlp, texts):
+def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
        pass
-def _read_inputs(loc, msg):
+def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
--- a/spacy/cli/project.py
+++ b/spacy/cli/project.py
@ -1,64 +1,25 @@
-from typing import List, Dict
+from typing import List, Dict, Any
 import typer
 import srsly
 from pathlib import Path
 import os
 import subprocess
 import sys
 from wasabi import msg
 import shlex
 from ._app import app, Arg, Opt
 from .. import about
 from ..schemas import ProjectConfigSchema, validate
 from ..util import run_command
 CONFIG_FILE = "project.yml"
-SUBDIRS = [
+DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"]
    "assets",
    "configs",
    "packages",
    "metrics",
    "scripts",
    "notebooks",
    "training",
 ]
 project_cli = typer.Typer(help="Command-line interface for spaCy projects")
 def load_project_config(path):
    """Read and validate the project config file found in a directory.
    path (Path): Directory expected to contain the CONFIG_FILE.
    RETURNS: The config as loaded by srsly.read_yaml.
    """
    config_file = path / CONFIG_FILE
    if not config_file.exists():
        msg.fail("Can't find project config", config_file, exits=1)
    project_config = srsly.read_yaml(config_file)
    # Schema problems are reported together, one per line
    validation_errors = validate(ProjectConfigSchema, project_config)
    if validation_errors:
        details = "\n".join(validation_errors)
        msg.fail(f"Invalid project config in {CONFIG_FILE}", details, exits=1)
    return project_config
 def create_dirs(project_dir: Path):
    """Create every directory listed in SUBDIRS inside the project directory.
    project_dir (Path): Directory under which the subdirectories are created.
    """
    for dirname in SUBDIRS:
        target = project_dir / dirname
        # No exist_ok is passed, so mkdir raises FileExistsError if the
        # target directory is already there.
        target.mkdir(parents=True)
 def run_cmd(command: str):
    """Run a command string as a subprocess and exit if it fails.
    command (str): The command line, split into arguments with shlex.
    """
    args = shlex.split(command)
    exit_code = subprocess.call(args, env=os.environ.copy())
    # Propagate a non-zero exit status of the child as our own exit status
    if exit_code:
        sys.exit(exit_code)
 def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}):
    """Fill in variables for each command template, print it and run it.
    commands (List[str]): Command strings, optionally with {NAME} placeholders.
    variables (Dict[str, str]): Values substituted into the placeholders.
    """
    for template in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        rendered = template.format(**variables)
        msg.info(rendered)
        run_cmd(rendered)
@project_cli.command("clone")
-def project_clone(
+def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to fetch"),
    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=True, file_okay=False),
@ -70,13 +31,17 @@ def project_clone(
@project_cli.command("run")
-def project_run(
+def project_run_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(None, help="Name of command defined in project config")
    # fmt: on
 ):
    """Run scripts defined in the project."""
    # CLI wrapper only: the argument declarations above describe the command
    # line interface, and the actual work is delegated to project_run.
    project_run(project_dir, subcommand)
 def project_run(project_dir: Path, subcommand: str) -> None:
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
@ -98,3 +63,27 @@ def project_run(
 app.add_typer(project_cli, name="project")
 def load_project_config(path: Path) -> Dict[str, Any]:
    """Read and validate the project config file found in a directory.
    path (Path): Directory expected to contain the CONFIG_FILE.
    RETURNS (Dict[str, Any]): The config as loaded by srsly.read_yaml.
    """
    config_file = path / CONFIG_FILE
    if not config_file.exists():
        msg.fail("Can't find project config", config_file, exits=1)
    project_config = srsly.read_yaml(config_file)
    # Schema problems are reported together, one per line
    validation_errors = validate(ProjectConfigSchema, project_config)
    if validation_errors:
        details = "\n".join(validation_errors)
        msg.fail(f"Invalid project config in {CONFIG_FILE}", details, exits=1)
    return project_config
 def create_dirs(project_dir: Path) -> None:
    """Create every directory listed in DIRS inside the project directory.
    project_dir (Path): Directory under which the subdirectories are created.
    """
    for dirname in DIRS:
        target = project_dir / dirname
        # No exist_ok is passed, so mkdir raises FileExistsError if the
        # target directory is already there.
        target.mkdir(parents=True)
 def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None:
    """Fill in variables for each command template, print it and run it.
    commands (List[str]): Command strings, optionally with {NAME} placeholders.
    variables (Dict[str, str]): Values substituted into the placeholders.
    """
    for template in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        rendered = template.format(**variables)
        msg.info(rendered)
        # The shared helper expects an already split argument list
        args = shlex.split(rendered)
        run_command(args)
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Dict
 from timeit import default_timer as timer
 import srsly
 import tqdm
@ -85,9 +85,9 @@ subword_features = true
@app.command("train")
 def train_cli(
    # fmt: off
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data"),
+    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data"),
+    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
-    config_path: Path = Arg(..., help="Path to config file"),
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
@ -162,14 +162,14 @@ def train_cli(
 def train(
-    config_path,
+    config_path: Path,
-    data_paths,
+    data_paths: Dict[str, Path],
-    raw_text=None,
+    raw_text: Optional[Path] = None,
-    output_path=None,
+    output_path: Optional[Path] = None,
-    tag_map=None,
+    tag_map: Optional[Path] = None,
-    weights_data=None,
+    weights_data: Optional[bytes] = None,
-    omit_extra_lookups=False,
+    omit_extra_lookups: bool = False,
-):
+) -> None:
    msg.info(f"Loading config from: {config_path}")
    # Read the config first without creating objects, to get to the original nlp_config
    config = util.load_config(config_path, create_objects=False)
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@ -1,7 +1,8 @@
 from typing import Tuple
 from pathlib import Path
 import sys
 import requests
-from wasabi import msg
+from wasabi import msg, Printer
 from ._app import app
 from .. import about
@ -10,11 +11,15 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
-def validate():
+def validate_cli():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    # CLI wrapper only: the command takes no arguments and hands off to the
    # plain validate function, which contains the actual logic.
    validate()
 def validate() -> None:
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_base_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
@ -57,7 +62,8 @@ def validate():
        sys.exit(1)
-def get_model_pkgs():
+def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
    msg = Printer(no_print=silent, pretty=not silent)
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
@ -95,7 +101,7 @@ def get_model_pkgs():
    return pkgs, compat
-def reformat_version(version):
+def reformat_version(version: str) -> str:
    """Hack to reformat old versions ending on '-alpha' to match pip format."""
    if version.endswith("-alpha"):
        return version.replace("-alpha", "a0")
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -1,4 +1,4 @@
-from typing import Dict, List, Union, Optional, Sequence
+from typing import Dict, List, Union, Optional, Sequence, Any
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath
@ -164,7 +164,7 @@ class ModelMetaSchema(BaseModel):
    email: Optional[StrictStr] = Field(None, title="Model author email")
    url: Optional[StrictStr] = Field(None, title="Model author URL")
    sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
-    vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors")
+    vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors")
    accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
    speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
    # fmt: on
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1,10 +1,10 @@
 from typing import List, Union
 import os
 import importlib
 import importlib.util
 import re
 from pathlib import Path
 import random
 from typing import List
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
 import functools
@ -17,6 +17,8 @@ import sys
 import warnings
 from packaging.specifiers import SpecifierSet, InvalidSpecifier
 from packaging.version import Version, InvalidVersion
 import subprocess
 from contextlib import contextmanager
 try:
@ -427,6 +429,30 @@ def get_package_path(name):
    return Path(pkg.__file__).parent
 def run_command(command: List[str]) -> None:
    """Execute an already split command as a subprocess.
    If the subprocess finishes with a non-zero status, the current process
    exits with that same status.
    command (list): The split command.
    """
    env = os.environ.copy()
    exit_code = subprocess.call(command, env=env)
    if exit_code:
        sys.exit(exit_code)
@contextmanager
 def working_dir(path: Union[str, Path]) -> None:
    """Temporarily change the current working directory.
    The directory that was current on entry is restored when the context
    exits, including when the body raises.
    path (str / Path): The directory to navigate to.
    """
    original_dir = Path.cwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        # Always switch back, even if the with-body failed
        os.chdir(original_dir)
 def is_in_jupyter():
    """Check if user is running spaCy from a Jupyter notebook by detecting the
    IPython kernel. Mainly used for the displaCy visualizer.