diff --git a/requirements.txt b/requirements.txt index a104b68ba..55b234073 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,13 +5,13 @@ thinc==8.0.0a9 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.4.0,<1.1.0 -srsly>=2.0.0,<3.0.0 +wasabi>=0.7.0,<1.1.0 +srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 +typer>=0.2.1,<1.0.0 # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 -plac>=0.9.6,<1.2.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.3.0,<2.0.0 # Official Python utilities diff --git a/setup.cfg b/setup.cfg index b18f2c6f0..01b18ef29 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,14 +44,13 @@ install_requires = preshed>=3.0.2,<3.1.0 thinc==8.0.0a9 blis>=0.4.0,<0.5.0 - wasabi>=0.4.0,<1.1.0 - srsly>=2.0.0,<3.0.0 + wasabi>=0.7.0,<1.1.0 + srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 - ml_datasets>=0.1.1 + typer>=0.2.1,<1.0.0 # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 - plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 # Official Python utilities diff --git a/spacy/__init__.py b/spacy/__init__.py index e4e1f6c8e..b525a5ba5 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -8,7 +8,7 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed") from thinc.api import prefer_gpu, require_gpu from . 
import pipeline -from .cli.info import info as cli_info +from .cli.info import info from .glossary import explain from .about import __version__ from .errors import Errors, Warnings @@ -34,7 +34,3 @@ def load(name, **overrides): def blank(name, **kwargs): LangClass = util.get_lang_class(name) return LangClass(**kwargs) - - -def info(model=None, markdown=False, silent=False): - return cli_info(model, markdown, silent) diff --git a/spacy/__main__.py b/spacy/__main__.py index beed3170d..f6b5066b7 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,31 +1,4 @@ if __name__ == "__main__": - import plac - import sys - from wasabi import msg - from spacy.cli import download, link, info, package, pretrain, convert - from spacy.cli import init_model, profile, evaluate, validate, debug_data - from spacy.cli import train_cli + from spacy.cli import setup_cli - commands = { - "download": download, - "link": link, - "info": info, - "train": train_cli, - "pretrain": pretrain, - "debug-data": debug_data, - "evaluate": evaluate, - "convert": convert, - "package": package, - "init-model": init_model, - "profile": profile, - "validate": validate, - } - if len(sys.argv) == 1: - msg.info("Available commands", ", ".join(commands), exits=1) - command = sys.argv.pop(1) - sys.argv[0] = f"spacy {command}" - if command in commands: - plac.call(commands[command], sys.argv[1:]) - else: - available = f"Available: {', '.join(commands)}" - msg.fail(f"Unknown command: {command}", available, exits=1) + setup_cli() diff --git a/spacy/about.py b/spacy/about.py index 04a660ad1..54753b5a1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -5,3 +5,4 @@ __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json" +__projects__ = 
"https://github.com/explosion/spacy-boilerplates" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2ffbe2d0c..206f8dd3b 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,10 +1,14 @@ from wasabi import msg +from ._app import app, setup_cli # noqa: F401 + +# These are the actual functions, NOT the wrapped CLI commands. The CLI commands +# are registered automatically and won't have to be imported here. from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train_from_config import train_cli # noqa: F401 +from .train_from_config import train # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 @@ -13,7 +17,10 @@ from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 +@app.command("link", no_args_is_help=True, deprecated=True, hidden=True) def link(*args, **kwargs): + """As of spaCy v3.0, model symlinks are deprecated. You can load models + using their full names or from a directory path.""" msg.warn( "As of spaCy v3.0, model symlinks are deprecated. You can load models " "using their full names or from a directory path." 
diff --git a/spacy/cli/_app.py b/spacy/cli/_app.py new file mode 100644 index 000000000..d1c470b32 --- /dev/null +++ b/spacy/cli/_app.py @@ -0,0 +1,31 @@ +from typing import Optional +import typer +from typer.main import get_command + + +COMMAND = "python -m spacy" +NAME = "spacy" +HELP = """spaCy Command-line Interface + +DOCS: https://spacy.io/api/cli +""" + + +app = typer.Typer(name=NAME, help=HELP) + + +def Arg(*args, help: Optional[str] = None, **kwargs) -> typer.Argument: + """Wrapper for Typer's annotation to keep it short and set defaults.""" + # Filter out help for now until it's officially supported + return typer.Argument(*args, **kwargs) + + +def Opt(*args, **kwargs) -> typer.Option: + """Wrapper for Typer's annotation to keep it short and set defaults.""" + return typer.Option(*args, show_default=True, **kwargs) + + +def setup_cli() -> None: + # Ensure that the help messages always display the correct prompt + command = get_command(app) + command(prog_name=COMMAND) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 2ffbeb458..24d266504 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,8 +1,11 @@ +from typing import Optional +from enum import Enum from pathlib import Path from wasabi import Printer import srsly import re +from ._app import app, Arg, Opt from .converters import conllu2json, iob2json, conll_ner2json from .converters import ner_jsonl2json @@ -21,23 +24,29 @@ CONVERTERS = { } # File types -FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") -def convert( +class FileTypes(str, Enum): + json = "json" + jsonl = "jsonl" + msg = "msg" + + +@app.command("convert") +def convert_cli( # fmt: off - input_file: ("Input file", "positional", None, str), - output_dir: ("Output directory. 
'-' for stdout.", "positional", None, str) = "-", - file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", - n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, - seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, - model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, - morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, - merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False, - converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", - ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, - lang: ("Language (if tokenizer required)", "option", "l", str) = None, + input_file: str = Arg(..., help="Input file", exists=True), + output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True), + file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"), + n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"), + seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"), + model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"), + morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), + merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), + converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), + ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), + lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), # fmt: on ): """ @@ -46,8 +55,42 @@ def convert( is written to stdout, so you can pipe them forward to a 
JSON file: $ spacy convert some_file.conllu > some_file.json """ - no_print = output_dir == "-" - msg = Printer(no_print=no_print) + if isinstance(file_type, FileTypes): + # We get an instance of the FileTypes from the CLI so we need its string value + file_type = file_type.value + silent = output_dir == "-" + convert( + input_file, + output_dir, + file_type=file_type, + n_sents=n_sents, + seg_sents=seg_sents, + model=model, + morphology=morphology, + merge_subtokens=merge_subtokens, + converter=converter, + ner_map_path=ner_map_path, + lang=lang, + silent=silent, + ) + + +def convert( + input_file: Path, + output_dir: Path, + *, + file_type: str = "json", + n_sents: int = 1, + seg_sents: bool = False, + model: Optional[str] = None, + morphology: bool = False, + merge_subtokens: bool = False, + converter: str = "auto", + ner_map_path: Optional[Path] = None, + lang: Optional[str] = None, + silent: bool = True, +) -> None: + msg = Printer(no_print=silent, pretty=not silent) input_path = Path(input_file) if file_type not in FILE_TYPES_STDOUT and output_dir == "-": # TODO: support msgpack via stdout in srsly? @@ -73,7 +116,8 @@ def convert( converter = converter_autodetect else: msg.warn( - "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert" + "Can't automatically detect NER format. Conversion may not " + "succeed. 
See https://spacy.io/api/cli#convert" ) if converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1) @@ -90,7 +134,7 @@ def convert( merge_subtokens=merge_subtokens, lang=lang, model=model, - no_print=no_print, + no_print=silent, ner_map=ner_map, ) if output_dir != "-": @@ -112,7 +156,7 @@ def convert( srsly.write_jsonl("-", data) -def autodetect_ner_format(input_data): +def autodetect_ner_format(input_data: str) -> str: # guess format from the first 20 lines lines = input_data.split("\n")[:20] format_guesses = {"ner": 0, "iob": 0} diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 21f49956d..2cc3020e6 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,11 +1,14 @@ +from typing import Optional, List, Sequence, Dict, Any, Tuple from pathlib import Path from collections import Counter import sys import srsly from wasabi import Printer, MESSAGES -from ..gold import GoldCorpus +from ._app import app, Arg, Opt +from ..gold import GoldCorpus, Example from ..syntax import nonproj +from ..language import Language from ..util import load_model, get_lang_class @@ -18,17 +21,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD = 2000 -def debug_data( +@app.command("debug-data") +def debug_data_cli( # fmt: off - lang: ("Model language", "positional", None, str), - train_path: ("Location of JSON-formatted training data", "positional", None, Path), - dev_path: ("Location of JSON-formatted development data", "positional", None, Path), - tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, - base_model: ("Name of model to update (optional)", "option", "b", str) = None, - pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner", - ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False, - verbose: ("Print additional information and explanations", "flag", "V", bool) = False, - no_format: 
("Don't pretty-print the results", "flag", "NF", bool) = False, + lang: str = Arg(..., help="Model language"), + train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), + dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), + tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False), + base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"), + pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"), + ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), + verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), + no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), # fmt: on ): """ @@ -36,8 +40,36 @@ def debug_data( stats, and find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
""" - msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) + debug_data( + lang, + train_path, + dev_path, + tag_map_path=tag_map_path, + base_model=base_model, + pipeline=[p.strip() for p in pipeline.split(",")], + ignore_warnings=ignore_warnings, + verbose=verbose, + no_format=no_format, + silent=False, + ) + +def debug_data( + lang: str, + train_path: Path, + dev_path: Path, + *, + tag_map_path: Optional[Path] = None, + base_model: Optional[str] = None, + pipeline: List[str] = ["tagger", "parser", "ner"], + ignore_warnings: bool = False, + verbose: bool = False, + no_format: bool = True, + silent: bool = True, +): + msg = Printer( + no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings + ) # Make sure all files and paths exists if they are needed if not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) @@ -49,7 +81,6 @@ def debug_data( tag_map = srsly.read_json(tag_map_path) # Initialize the model and pipeline - pipeline = [p.strip() for p in pipeline.split(",")] if base_model: nlp = load_model(base_model) else: @@ -446,7 +477,7 @@ def debug_data( sys.exit(1) -def _load_file(file_path, msg): +def _load_file(file_path: Path, msg: Printer) -> None: file_name = file_path.parts[-1] if file_path.suffix == ".json": with msg.loading(f"Loading {file_name}..."): @@ -465,7 +496,9 @@ def _load_file(file_path, msg): ) -def _compile_gold(examples, pipeline, nlp): +def _compile_gold( + examples: Sequence[Example], pipeline: List[str], nlp: Language +) -> Dict[str, Any]: data = { "ner": Counter(), "cats": Counter(), @@ -537,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp): return data -def _format_labels(labels, counts=False): +def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str: if counts: return ", ".join([f"'{l}' ({c})" for l, c in labels]) return ", ".join([f"'{l}'" for l in labels]) -def _get_examples_without_label(data, label): +def _get_examples_without_label(data: 
Sequence[Example], label: str) -> int: count = 0 for ex in data: labels = [ @@ -556,7 +589,7 @@ def _get_examples_without_label(data, label): return count -def _get_labels_from_model(nlp, pipe_name): +def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]: if pipe_name not in nlp.pipe_names: return set() pipe = nlp.get_pipe(pipe_name) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 3d56822a5..adc8d09fa 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,23 +1,36 @@ +from typing import Optional, Sequence, Union import requests -import os -import subprocess import sys from wasabi import msg +import typer +from ._app import app, Arg, Opt from .. import about -from ..util import is_package, get_base_version +from ..util import is_package, get_base_version, run_command -def download( - model: ("Model to download (shortcut or name)", "positional", None, str), - direct: ("Force direct download of name + version", "flag", "d", bool) = False, - *pip_args: ("Additional arguments to be passed to `pip install` on model install"), +@app.command( + "download", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def download_cli( + # fmt: off + ctx: typer.Context, + model: str = Arg(..., help="Model to download (shortcut or name)"), + direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"), + # fmt: on ): """ Download compatible model from default download path using pip. If --direct flag is set, the command expects the full model name with version. - For direct downloads, the compatibility check will be skipped. + For direct downloads, the compatibility check will be skipped. All + additional arguments provided to this command will be passed to `pip install` + on model installation. 
""" + download(model, direct, *ctx.args) + + +def download(model: str, direct: bool = False, *pip_args) -> None: if not is_package("spacy") and "--no-deps" not in pip_args: msg.warn( "Skipping model package dependencies and setting `--no-deps`. " @@ -33,22 +46,20 @@ def download( components = model.split("-") model_name = "".join(components[:-1]) version = components[-1] - dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) + download_model(dl_tpl.format(m=model_name, v=version), pip_args) else: shortcuts = get_json(about.__shortcuts__, "available shortcuts") model_name = shortcuts.get(model, model) compatibility = get_compatibility() version = get_version(model_name, compatibility) - dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) - if dl != 0: # if download subprocess doesn't return 0, exit - sys.exit(dl) - msg.good( - "Download and installation successful", - f"You can now load the model via spacy.load('{model_name}')", - ) + download_model(dl_tpl.format(m=model_name, v=version), pip_args) + msg.good( + "Download and installation successful", + f"You can now load the model via spacy.load('{model_name}')", + ) -def get_json(url, desc): +def get_json(url: str, desc: str) -> Union[dict, list]: r = requests.get(url) if r.status_code != 200: msg.fail( @@ -62,7 +73,7 @@ def get_json(url, desc): return r.json() -def get_compatibility(): +def get_compatibility() -> dict: version = get_base_version(about.__version__) comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table["spacy"] @@ -71,7 +82,7 @@ def get_compatibility(): return comp[version] -def get_version(model, comp): +def get_version(model: str, comp: dict) -> str: model = get_base_version(model) if model not in comp: msg.fail( @@ -81,10 +92,12 @@ def get_version(model, comp): return comp[model][0] -def download_model(filename, user_pip_args=None): +def download_model( + filename: str, user_pip_args: Optional[Sequence[str]] = None +) -> None: 
download_url = about.__download_url__ + "/" + filename pip_args = ["--no-cache-dir"] if user_pip_args: pip_args.extend(user_pip_args) cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] - return subprocess.call(cmd, env=os.environ.copy()) + run_command(cmd) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index bae252b1c..8d0f67316 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,26 +1,52 @@ +from typing import Optional, List from timeit import default_timer as timer -from wasabi import msg +from wasabi import Printer +from pathlib import Path +from ._app import app, Arg, Opt +from ..tokens import Doc +from ..scorer import Scorer from ..gold import GoldCorpus from .. import util from .. import displacy -def evaluate( +@app.command("evaluate") +def evaluate_cli( # fmt: off - model: ("Model name or path", "positional", None, str), - data_path: ("Location of JSON-formatted evaluation data", "positional", None, str), - gpu_id: ("Use GPU", "option", "g", int) = -1, - gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, - displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None, - displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25, - return_scores: ("Return dict containing model scores", "flag", "R", bool) = False, + model: str = Arg(..., help="Model name or path"), + data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True), + gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), + gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), + displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), + displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), # fmt: on ): """ Evaluate a model. 
To render a sample of parses in a HTML file, set an output directory as the displacy_path argument. """ + evaluate( + model, + data_path, + gpu_id=gpu_id, + gold_preproc=gold_preproc, + displacy_path=displacy_path, + displacy_limit=displacy_limit, + silent=False, + ) + + +def evaluate( + model: str, + data_path: Path, + gpu_id: int = -1, + gold_preproc: bool = False, + displacy_path: Optional[Path] = None, + displacy_limit: int = 25, + silent: bool = True, +) -> Scorer: + msg = Printer(no_print=silent, pretty=not silent) util.fix_random_seed() if gpu_id >= 0: util.use_gpu(gpu_id) @@ -75,11 +101,17 @@ def evaluate( ents=render_ents, ) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) - if return_scores: - return scorer.scores + return scorer.scores -def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True): +def render_parses( + docs: List[Doc], + output_path: Path, + model_name: str = "", + limit: int = 250, + deps: bool = True, + ents: bool = True, +): docs[0].user_data["title"] = model_name if ents: html = displacy.render(docs[:limit], style="ent", page=True) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 98fd5cabf..9f1ec3855 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,77 +1,109 @@ +from typing import Optional, Dict, Any, Union import platform from pathlib import Path -from wasabi import msg +from wasabi import Printer import srsly -from .validate import get_model_pkgs +from ._app import app, Arg, Opt from .. import util from .. 
import about -def info( - model: ("Optional model name", "positional", None, str) = None, - markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False, - silent: ("Don't print anything (just return)", "flag", "s") = False, +@app.command("info") +def info_cli( + # fmt: off + model: Optional[str] = Arg(None, help="Optional model name"), + markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), + silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"), + # fmt: on ): """ Print info about spaCy installation. If a model is speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. """ + info(model, markdown=markdown, silent=silent) + + +def info( + model: Optional[str] = None, *, markdown: bool = False, silent: bool = True +) -> Union[str, dict]: + msg = Printer(no_print=silent, pretty=not silent) if model: - if util.is_package(model): - model_path = util.get_package_path(model) - else: - model_path = model - meta_path = model_path / "meta.json" - if not meta_path.is_file(): - msg.fail("Can't find model meta.json", meta_path, exits=1) - meta = srsly.read_json(meta_path) - if model_path.resolve() != model_path: - meta["link"] = str(model_path) - meta["source"] = str(model_path.resolve()) - else: - meta["source"] = str(model_path) + title = f"Info about model '{model}'" + data = info_model(model, silent=silent) + else: + title = "Info about spaCy" + data = info_spacy() + raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()} + if "Models" in data and isinstance(data["Models"], dict): + data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items()) + markdown_data = get_markdown(data, title=title) + if markdown: if not silent: - title = f"Info about model '{model}'" - model_meta = { - k: v for k, v in meta.items() if k not in ("accuracy", "speed") - } - if markdown: - 
print_markdown(model_meta, title=title) - else: - msg.table(model_meta, title=title) - return meta - all_models, _ = get_model_pkgs() - data = { + print(markdown_data) + return markdown_data + if not silent: + table_data = dict(data) + msg.table(table_data, title=title) + return raw_data + + +def info_spacy() -> Dict[str, any]: + """Generate info about the current spaCy installation. + + RETURNS (dict): The spaCy info. + """ + all_models = {} + for pkg_name in util.get_installed_models(): + package = pkg_name.replace("-", "_") + all_models[package] = util.get_package_version(pkg_name) + return { "spaCy version": about.__version__, "Location": str(Path(__file__).parent.parent), "Platform": platform.platform(), "Python version": platform.python_version(), - "Models": ", ".join( - f"{m['name']} ({m['version']})" for m in all_models.values() - ), + "Models": all_models, } - if not silent: - title = "Info about spaCy" - if markdown: - print_markdown(data, title=title) - else: - msg.table(data, title=title) - return data -def print_markdown(data, title=None): - """Print data in GitHub-flavoured Markdown format for issues etc. +def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]: + """Generate info about a specific model. + + model (str): Model name or path. + silent (bool): Don't print anything, just return. + RETURNS (dict): The model meta. 
+ """ + msg = Printer(no_print=silent, pretty=not silent) + if util.is_package(model): + model_path = util.get_package_path(model) + else: + model_path = model + meta_path = model_path / "meta.json" + if not meta_path.is_file(): + msg.fail("Can't find model meta.json", meta_path, exits=1) + meta = srsly.read_json(meta_path) + if model_path.resolve() != model_path: + meta["link"] = str(model_path) + meta["source"] = str(model_path.resolve()) + else: + meta["source"] = str(model_path) + return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")} + + +def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str: + """Get data in GitHub-flavoured Markdown format for issues etc. data (dict or list of tuples): Label/value pairs. title (str / None): Title, will be rendered as headline 2. + RETURNS (str): The Markdown string. """ markdown = [] for key, value in data.items(): if isinstance(value, str) and Path(value).exists(): continue markdown.append(f"* **{key}:** {value}") + result = "\n{}\n".format("\n".join(markdown)) if title: - print(f"\n## {title}") - print("\n{}\n".format("\n".join(markdown))) + result = f"\n## {title}\n{result}" + return result diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 700fa43de..37f862ef2 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -1,3 +1,4 @@ +from typing import Optional, List, Dict, Any, Union, IO import math from tqdm import tqdm import numpy @@ -9,10 +10,12 @@ import gzip import zipfile import srsly import warnings -from wasabi import msg +from wasabi import Printer +from ._app import app, Arg, Opt from ..vectors import Vectors from ..errors import Errors, Warnings +from ..language import Language from ..util import ensure_path, get_lang_class, load_model, OOV_RANK from ..lookups import Lookups @@ -25,20 +28,21 @@ except ImportError: DEFAULT_OOV_PROB = -20 -def init_model( +@app.command("init-model") +def init_model_cli( # fmt: off - lang: ("Model language", 
"positional", None, str), - output_dir: ("Model output directory", "positional", None, Path), - freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None, - clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None, - jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None, - vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None, - prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1, - truncate_vectors: ("Optional number of vectors to truncate to when reading in vectors file", "option", "t", int) = 0, - vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None, - model_name: ("Optional name for the model meta", "option", "mn", str) = None, - omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, - base_model: ("Base model (for languages with custom tokenizers)", "option", "b", str) = None + lang: str = Arg(..., help="Model language"), + output_dir: Path = Arg(..., help="Model output directory"), + freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True), + clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True), + jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), + vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), + prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), + truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), + vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. 
en_core_web_lg.vectors"), + model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), + omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"), + base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)") # fmt: on ): """ @@ -46,6 +50,38 @@ def init_model( and word vectors. If vectors are provided in Word2Vec format, they can be either a .txt or zipped as a .zip or .tar.gz. """ + init_model( + lang, + output_dir, + freqs_loc=freqs_loc, + clusters_loc=clusters_loc, + jsonl_loc=jsonl_loc, + prune_vectors=prune_vectors, + truncate_vectors=truncate_vectors, + vectors_name=vectors_name, + model_name=model_name, + omit_extra_lookups=omit_extra_lookups, + base_model=base_model, + silent=False, + ) + + +def init_model( + lang: str, + output_dir: Path, + freqs_loc: Optional[Path] = None, + clusters_loc: Optional[Path] = None, + jsonl_loc: Optional[Path] = None, + vectors_loc: Optional[Path] = None, + prune_vectors: int = -1, + truncate_vectors: int = 0, + vectors_name: Optional[str] = None, + model_name: Optional[str] = None, + omit_extra_lookups: bool = False, + base_model: Optional[str] = None, + silent: bool = True, +) -> Language: + msg = Printer(no_print=silent, pretty=not silent) if jsonl_loc is not None: if freqs_loc is not None or clusters_loc is not None: settings = ["-j"] @@ -68,7 +104,7 @@ def init_model( freqs_loc = ensure_path(freqs_loc) if freqs_loc is not None and not freqs_loc.exists(): msg.fail("Can't find words frequencies file", freqs_loc, exits=1) - lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc) + lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc) with msg.loading("Creating model..."): nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) @@ -83,7 +119,9 @@ def init_model( msg.good("Successfully created model") if vectors_loc is not 
None: - add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) + add_vectors( + msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name + ) vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) msg.good( @@ -95,7 +133,7 @@ def init_model( return nlp -def open_file(loc): +def open_file(loc: Union[str, Path]) -> IO: """Handle .gz, .tar.gz or unzipped files""" loc = ensure_path(loc) if tarfile.is_tarfile(str(loc)): @@ -111,7 +149,9 @@ def open_file(loc): return loc.open("r", encoding="utf8") -def read_attrs_from_deprecated(freqs_loc, clusters_loc): +def read_attrs_from_deprecated( + msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path] +) -> List[Dict[str, Any]]: if freqs_loc is not None: with msg.loading("Counting frequencies..."): probs, _ = read_freqs(freqs_loc) @@ -139,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc): return lex_attrs -def create_model(lang, lex_attrs, name=None, base_model=None): +def create_model( + lang: str, + lex_attrs: List[Dict[str, Any]], + name: Optional[str] = None, + base_model: Optional[Union[str, Path]] = None, +) -> Language: if base_model: nlp = load_model(base_model) # keep the tokenizer but remove any existing pipeline components due to @@ -166,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None): return nlp -def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): +def add_vectors( + msg: Printer, + nlp: Language, + vectors_loc: Optional[Path], + truncate_vectors: int, + prune_vectors: int, + name: Optional[str] = None, +) -> None: vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) @@ -176,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): else: if vectors_loc: with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = 
read_vectors(vectors_loc) + vectors_data, vector_keys = read_vectors(msg, vectors_loc) msg.good(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) @@ -195,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): nlp.vocab.prune_vectors(prune_vectors) -def read_vectors(vectors_loc, truncate_vectors=0): +def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0): f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) if truncate_vectors >= 1: @@ -215,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0): return vectors_data, vectors_keys -def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): +def read_freqs( + freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50 +): counts = PreshCounter() total = 0 with freqs_loc.open() as f: @@ -244,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): return probs, oov_prob -def read_clusters(clusters_loc): +def read_clusters(clusters_loc: Path) -> dict: clusters = {} if ftfy is None: warnings.warn(Warnings.W004) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 153e61ba3..24d9a0a08 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,19 +1,24 @@ +from typing import Optional, Union, Any, Dict import shutil from pathlib import Path -from wasabi import msg, get_raw_input +from wasabi import Printer, get_raw_input import srsly +import sys +from ._app import app, Arg, Opt +from ..schemas import validate, ModelMetaSchema from .. import util from .. 
import about -def package( +@app.command("package") +def package_cli( # fmt: off - input_dir: ("Directory with model data", "positional", None, str), - output_dir: ("Output parent directory", "positional", None, str), - meta_path: ("Path to meta.json", "option", "m", str) = None, - create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False, - force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False, + input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False), + output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), + meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False), + create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), + force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"), # fmt: on ): """ @@ -23,6 +28,25 @@ def package( set and a meta.json already exists in the output directory, the existing values will be used as the defaults in the command-line prompt. 
""" + package( + input_dir, + output_dir, + meta_path=meta_path, + create_meta=create_meta, + force=force, + silent=False, + ) + + +def package( + input_dir: Path, + output_dir: Path, + meta_path: Optional[Path] = None, + create_meta: bool = False, + force: bool = False, + silent: bool = True, +) -> None: + msg = Printer(no_print=silent, pretty=not silent) input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) @@ -33,23 +57,20 @@ def package( if meta_path and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) - meta_path = meta_path or input_path / "meta.json" - if meta_path.is_file(): - meta = srsly.read_json(meta_path) - if not create_meta: # only print if user doesn't want to overwrite - msg.good("Loaded meta.json from file", meta_path) - else: - meta = generate_meta(input_dir, meta, msg) - for key in ("lang", "name", "version"): - if key not in meta or meta[key] == "": - msg.fail( - f"No '{key}' setting found in meta.json", - "This setting is required to build your package.", - exits=1, - ) + meta_path = meta_path or input_dir / "meta.json" + if not meta_path.exists() or not meta_path.is_file(): + msg.fail("Can't load model meta.json", meta_path, exits=1) + meta = srsly.read_json(meta_path) + if not create_meta: # only print if user doesn't want to overwrite + msg.good("Loaded meta.json from file", meta_path) + else: + meta = generate_meta(input_dir, meta, msg) + errors = validate(ModelMetaSchema, meta) + if errors: + msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) model_name = meta["lang"] + "_" + meta["name"] model_name_v = model_name + "-" + meta["version"] - main_path = output_path / model_name_v + main_path = output_dir / model_name_v package_path = main_path / model_name if package_path.exists(): @@ -63,21 +84,26 @@ def package( exits=1, ) Path.mkdir(package_path, parents=True) - shutil.copytree(str(input_path), str(package_path / 
model_name_v)) + shutil.copytree(str(input_dir), str(package_path / model_name_v)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(package_path / "__init__.py", TEMPLATE_INIT) msg.good(f"Successfully created package '{model_name_v}'", main_path) - msg.text("To build the package, run `python setup.py sdist` in this directory.") + with util.working_dir(main_path): + util.run_command([sys.executable, "setup.py", "sdist"]) + zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" + msg.good(f"Successfully created zipped Python package", zip_file) -def create_file(file_path, contents): +def create_file(file_path: Path, contents: str) -> None: file_path.touch() file_path.open("w", encoding="utf-8").write(contents) -def generate_meta(model_path, existing_meta, msg): +def generate_meta( + model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer +) -> Dict[str, Any]: meta = existing_meta or {} settings = [ ("lang", "Model language", meta.get("lang", "en")), diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 4f4029834..2962e5022 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,14 +1,15 @@ +from typing import Optional import random import numpy import time import re from collections import Counter -import plac from pathlib import Path from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory from wasabi import msg import srsly +from ._app import app, Arg, Opt from ..errors import Errors from ..ml.models.multi_task import build_masked_language_model from ..tokens import Doc @@ -17,25 +18,17 @@ from .. 
import util from ..gold import Example -@plac.annotations( +@app.command("pretrain") +def pretrain_cli( # fmt: off - texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), - vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str), - output_dir=("Directory to write models to on each epoch", "positional", None, Path), - config_path=("Path to config file", "positional", None, Path), - use_gpu=("Use GPU", "option", "g", int), - resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path), - epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int), + texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True), + vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"), + output_dir: Path = Arg(..., help="Directory to write models to on each epoch"), + config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), + use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), + resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), + epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), # fmt: on -) -def pretrain( - texts_loc, - vectors_model, - config_path, - output_dir, - use_gpu=-1, - resume_path=None, - epoch_resume=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -52,6 +45,26 @@ def pretrain( all settings are the same between pretraining and training. 
Ideally, this is done by using the same config file for both commands. """ + pretrain( + texts_loc, + vectors_model, + output_dir, + config_path, + use_gpu=use_gpu, + resume_path=resume_path, + epoch_resume=epoch_resume, + ) + + +def pretrain( + texts_loc: Path, + vectors_model: str, + output_dir: Path, + config_path: Path, + use_gpu: int = -1, + resume_path: Optional[Path] = None, + epoch_resume: Optional[int] = None, +): if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 5b7a02212..ee9f3e707 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -1,3 +1,4 @@ +from typing import Optional, Sequence, Union, Iterator import tqdm from pathlib import Path import srsly @@ -5,17 +6,19 @@ import cProfile import pstats import sys import itertools -import ml_datasets -from wasabi import msg +from wasabi import msg, Printer +from ._app import app, Arg, Opt +from ..language import Language from ..util import load_model -def profile( +@app.command("profile") +def profile_cli( # fmt: off - model: ("Model to load", "positional", None, str), - inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None, - n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000, + model: str = Arg(..., help="Model to load"), + inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True), + n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"), # fmt: on ): """ @@ -24,6 +27,18 @@ def profile( It can either be provided as a JSONL file, or be read from sys.sytdin. If no input file is specified, the IMDB dataset is loaded via Thinc. 
""" + profile(model, inputs=inputs, n_texts=n_texts) + + +def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None: + try: + import ml_datasets + except ImportError: + msg.fail( + "This command requires the ml_datasets library to be installed:" + "pip install ml_datasets", + exits=1, + ) if inputs is not None: inputs = _read_inputs(inputs, msg) if inputs is None: @@ -43,12 +58,12 @@ def profile( s.strip_dirs().sort_stats("time").print_stats() -def parse_texts(nlp, texts): +def parse_texts(nlp: Language, texts: Sequence[str]) -> None: for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): pass -def _read_inputs(loc, msg): +def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]: if loc == "-": msg.info("Reading input from sys.stdin") file_ = sys.stdin diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 14e6d5b56..79c3bf259 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,16 +1,15 @@ -from typing import Optional, Dict, List, Union, Sequence +from typing import Optional, Dict from timeit import default_timer as timer - import srsly -from pydantic import BaseModel, FilePath import tqdm from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import Model, use_pytorch_for_gpu_memory +from thinc.api import use_pytorch_for_gpu_memory import random +from ._app import app, Arg, Opt from ..gold import GoldCorpus from ..lookups import Lookups from .. import util @@ -19,6 +18,9 @@ from ..errors import Errors # Don't remove - required to load the built-in architectures from ..ml import models # noqa: F401 +# from ..schemas import ConfigSchema # TODO: include? 
+ + registry = util.registry CONFIG_STR = """ @@ -80,54 +82,20 @@ subword_features = true """ -class PipelineComponent(BaseModel): - factory: str - model: Model - - class Config: - arbitrary_types_allowed = True - - -class ConfigSchema(BaseModel): - optimizer: Optional["Optimizer"] - - class training(BaseModel): - patience: int = 10 - eval_frequency: int = 100 - dropout: float = 0.2 - init_tok2vec: Optional[FilePath] = None - max_epochs: int = 100 - orth_variant_level: float = 0.0 - gold_preproc: bool = False - max_length: int = 0 - use_gpu: int = 0 - scores: List[str] = ["ents_p", "ents_r", "ents_f"] - score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} - limit: int = 0 - batch_size: Union[Sequence[int], int] - - class nlp(BaseModel): - lang: str - vectors: Optional[str] - pipeline: Optional[Dict[str, PipelineComponent]] - - class Config: - extra = "allow" - - +@app.command("train") def train_cli( # fmt: off - train_path: ("Location of JSON-formatted training data", "positional", None, Path), - dev_path: ("Location of JSON-formatted development data", "positional", None, Path), - config_path: ("Path to config file", "positional", None, Path), - output_path: ("Output directory to store model in", "option", "o", Path) = None, - code_path: ("Path to Python file with additional code (registered functions) to be imported", "option", "c", Path) = None, - init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental.", "option", "t2v", Path) = None, - raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, - verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False, - use_gpu: ("Use GPU", "option", "g", int) = -1, - tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, - omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, + train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), + dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), + config_path: Path = Arg(..., help="Path to config file", exists=True), + output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"), + code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental."), + raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."), + verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"), + use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), + tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"), + omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"), # fmt: on ): """ @@ -194,14 +162,14 @@ def train_cli( def train( - config_path, - data_paths, - raw_text=None, - output_path=None, - tag_map=None, - weights_data=None, - omit_extra_lookups=False, -): + config_path: Path, + data_paths: Dict[str, Path], + raw_text: Optional[Path] = None, + output_path: Optional[Path] = None, + tag_map: Optional[Path] = None, + weights_data: Optional[bytes] = None, + omit_extra_lookups: bool = False, +) -> None: msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 080cd77e2..4271817f1 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -1,18 +1,25 @@ +from typing import Tuple from pathlib import Path import sys import requests -from wasabi import msg +from wasabi import msg, Printer +from ._app import app from .. import about from ..util import get_package_version, get_installed_models, get_base_version from ..util import get_package_path, get_model_meta, is_compatible_version -def validate(): +@app.command("validate") +def validate_cli(): """ Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. 
""" + validate() + + +def validate() -> None: model_pkgs, compat = get_model_pkgs() spacy_version = get_base_version(about.__version__) current_compat = compat.get(spacy_version, {}) @@ -55,7 +62,8 @@ def validate(): sys.exit(1) -def get_model_pkgs(): +def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]: + msg = Printer(no_print=silent, pretty=not silent) with msg.loading("Loading compatibility table..."): r = requests.get(about.__compatibility__) if r.status_code != 200: @@ -93,7 +101,7 @@ def get_model_pkgs(): return pkgs, compat -def reformat_version(version): +def reformat_version(version: str) -> str: """Hack to reformat old versions ending on '-alpha' to match pip format.""" if version.endswith("-alpha"): return version.replace("-alpha", "a0") diff --git a/spacy/schemas.py b/spacy/schemas.py index 3024326dd..04f9bbffa 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,8 +1,9 @@ -from typing import Dict, List, Union, Optional +from typing import Dict, List, Union, Optional, Sequence, Any from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator -from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool +from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath from collections import defaultdict +from thinc.api import Model from .attrs import NAMES @@ -163,24 +164,48 @@ class ModelMetaSchema(BaseModel): email: Optional[StrictStr] = Field(None, title="Model author email") url: Optional[StrictStr] = Field(None, title="Model author URL") sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") - vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors") + vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors") accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") # fmt: on 
-# Training data object in "simple training style" +# JSON training format -class SimpleTrainingSchema(BaseModel): - # TODO: write +class PipelineComponent(BaseModel): + factory: str + model: Model class Config: - title = "Schema for training data dict in passed to nlp.update" - extra = "forbid" + arbitrary_types_allowed = True -# JSON training format +class ConfigSchema(BaseModel): + optimizer: Optional["Optimizer"] + + class training(BaseModel): + patience: int = 10 + eval_frequency: int = 100 + dropout: float = 0.2 + init_tok2vec: Optional[FilePath] = None + max_epochs: int = 100 + orth_variant_level: float = 0.0 + gold_preproc: bool = False + max_length: int = 0 + use_gpu: int = 0 + scores: List[str] = ["ents_p", "ents_r", "ents_f"] + score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0} + limit: int = 0 + batch_size: Union[Sequence[int], int] + + class nlp(BaseModel): + lang: str + vectors: Optional[str] + pipeline: Optional[Dict[str, PipelineComponent]] + + class Config: + extra = "allow" class TrainingSchema(BaseModel): @@ -189,3 +214,34 @@ class TrainingSchema(BaseModel): class Config: title = "Schema for training data in spaCy's JSON format" extra = "forbid" + + +# Project config Schema + + +class ProjectConfigAsset(BaseModel): + dest: StrictStr = Field(..., title="Destination of downloaded asset") + url: StrictStr = Field(..., title="URL of asset") + + +class ProjectConfigCommand(BaseModel): + # fmt: off + name: StrictStr = Field(..., title="Name of command") + help: Optional[StrictStr] = Field(None, title="Command description") + script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") + dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies") + dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs") + dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") + # fmt: on + + +class ProjectConfigSchema(BaseModel): + # fmt: off + 
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands") + assets: List[ProjectConfigAsset] = Field([], title="Data assets") + run: List[StrictStr] = Field([], title="Names of project commands to execute, in order") + commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") + # fmt: on + + class Config: + title = "Schema for project configuration file" diff --git a/spacy/util.py b/spacy/util.py index d15d826bf..ed7ca5b3c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,10 +1,10 @@ +from typing import List, Union import os import importlib import importlib.util import re from pathlib import Path import random -from typing import List import thinc from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config import functools @@ -17,6 +17,8 @@ import sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.version import Version, InvalidVersion +import subprocess +from contextlib import contextmanager try: @@ -429,6 +431,30 @@ def get_package_path(name): return Path(pkg.__file__).parent +def run_command(command: List[str]) -> None: + """Run a command on the command line as a subprocess. + + command (list): The split command. + """ + status = subprocess.call(command, env=os.environ.copy()) + if status != 0: + sys.exit(status) + + +@contextmanager +def working_dir(path: Union[str, Path]) -> None: + """Change current working directory and returns to previous on exit. + + path (str / Path): The directory to navigate to. + """ + prev_cwd = Path.cwd() + os.chdir(str(path)) + try: + yield + finally: + os.chdir(prev_cwd) + + def is_in_jupyter(): """Check if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer.