Refactor CLI

This commit is contained in:
Ines Montani 2020-06-21 21:35:01 +02:00
parent c12713a8be
commit 275bab62df
15 changed files with 451 additions and 209 deletions

View File

@ -1,4 +1,7 @@
from spacy.cli import app from spacy.cli import app
from typer.main import get_command
if __name__ == "__main__":
    # Resolve the command object for the app first so it can be invoked with
    # an explicit program name: this ensures that the help messages always
    # display the correct prompt.
    cli = get_command(app)
    cli(prog_name="python -m spacy")

View File

@ -34,10 +34,10 @@ class FileTypes(str, Enum):
@app.command("convert") @app.command("convert")
def convert( def convert_cli(
# fmt: off # fmt: off
input_file: str = Arg(..., help="Input file"), input_file: str = Arg(..., help="Input file", exists=True),
output_dir: str = Arg("-", help="Output directory. '-' for stdout."), output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"), file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"),
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"), n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"), seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
@ -45,7 +45,7 @@ def convert(
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)"), ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
# fmt: on # fmt: on
): ):
@ -58,8 +58,39 @@ def convert(
if isinstance(file_type, FileTypes): if isinstance(file_type, FileTypes):
# We get an instance of the FileTypes from the CLI so we need its string value # We get an instance of the FileTypes from the CLI so we need its string value
file_type = file_type.value file_type = file_type.value
no_print = output_dir == "-" silent = output_dir == "-"
msg = Printer(no_print=no_print) convert(
input_file,
output_dir,
file_type=file_type,
n_sents=n_sents,
seg_sents=seg_sents,
model=model,
morphology=morphology,
merge_subtokens=merge_subtokens,
converter=converter,
ner_map_path=ner_map_path,
lang=lang,
silent=silent,
)
def convert(
input_file: Path,
output_dir: Path,
*,
file_type: str = "json",
n_sents: int = 1,
seg_sents: bool = False,
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
ner_map_path: Optional[Path] = None,
lang: Optional[str] = None,
silent: bool = True,
) -> None:
msg = Printer(no_print=silent, pretty=not silent)
input_path = Path(input_file) input_path = Path(input_file)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-": if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
# TODO: support msgpack via stdout in srsly? # TODO: support msgpack via stdout in srsly?
@ -85,7 +116,8 @@ def convert(
converter = converter_autodetect converter = converter_autodetect
else: else:
msg.warn( msg.warn(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert" "Can't automatically detect NER format. Conversion may not "
"succeed. See https://spacy.io/api/cli#convert"
) )
if converter not in CONVERTERS: if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1) msg.fail(f"Can't find converter for {converter}", exits=1)
@ -102,7 +134,7 @@ def convert(
merge_subtokens=merge_subtokens, merge_subtokens=merge_subtokens,
lang=lang, lang=lang,
model=model, model=model,
no_print=no_print, no_print=silent,
ner_map=ner_map, ner_map=ner_map,
) )
if output_dir != "-": if output_dir != "-":
@ -124,7 +156,7 @@ def convert(
srsly.write_jsonl("-", data) srsly.write_jsonl("-", data)
def autodetect_ner_format(input_data): def autodetect_ner_format(input_data: str) -> str:
# guess format from the first 20 lines # guess format from the first 20 lines
lines = input_data.split("\n")[:20] lines = input_data.split("\n")[:20]
format_guesses = {"ner": 0, "iob": 0} format_guesses = {"ner": 0, "iob": 0}

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, List, Sequence, Dict, Any, Tuple
from pathlib import Path from pathlib import Path
from collections import Counter from collections import Counter
import sys import sys
@ -6,8 +6,9 @@ import srsly
from wasabi import Printer, MESSAGES from wasabi import Printer, MESSAGES
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from ..gold import GoldCorpus from ..gold import GoldCorpus, Example
from ..syntax import nonproj from ..syntax import nonproj
from ..language import Language
from ..util import load_model, get_lang_class from ..util import load_model, get_lang_class
@ -21,12 +22,12 @@ BLANK_MODEL_THRESHOLD = 2000
@app.command("debug-data") @app.command("debug-data")
def debug_data( def debug_data_cli(
# fmt: off # fmt: off
lang: str = Arg(..., help="Model language"), lang: str = Arg(..., help="Model language"),
train_path: Path = Arg(..., help="Location of JSON-formatted training data"), train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data"), dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"), tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"), base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"), pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
@ -39,8 +40,36 @@ def debug_data(
stats, and find problems like invalid entity annotations, cyclic stats, and find problems like invalid entity annotations, cyclic
dependencies, low data labels and more. dependencies, low data labels and more.
""" """
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) debug_data(
lang,
train_path,
dev_path,
tag_map_path=tag_map_path,
base_model=base_model,
pipeline=[p.strip() for p in pipeline.split(",")],
ignore_warnings=ignore_warnings,
verbose=verbose,
no_format=no_format,
silent=False,
)
def debug_data(
lang: str,
train_path: Path,
dev_path: Path,
*,
tag_map_path: Optional[Path] = None,
base_model: Optional[str] = None,
pipeline: List[str] = ["tagger", "parser", "ner"],
ignore_warnings: bool = False,
verbose: bool = False,
no_format: bool = True,
silent: bool = True,
):
msg = Printer(
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
)
# Make sure all files and paths exists if they are needed # Make sure all files and paths exists if they are needed
if not train_path.exists(): if not train_path.exists():
msg.fail("Training data not found", train_path, exits=1) msg.fail("Training data not found", train_path, exits=1)
@ -52,7 +81,6 @@ def debug_data(
tag_map = srsly.read_json(tag_map_path) tag_map = srsly.read_json(tag_map_path)
# Initialize the model and pipeline # Initialize the model and pipeline
pipeline = [p.strip() for p in pipeline.split(",")]
if base_model: if base_model:
nlp = load_model(base_model) nlp = load_model(base_model)
else: else:
@ -449,7 +477,7 @@ def debug_data(
sys.exit(1) sys.exit(1)
def _load_file(file_path, msg): def _load_file(file_path: Path, msg: Printer) -> None:
file_name = file_path.parts[-1] file_name = file_path.parts[-1]
if file_path.suffix == ".json": if file_path.suffix == ".json":
with msg.loading(f"Loading {file_name}..."): with msg.loading(f"Loading {file_name}..."):
@ -468,7 +496,9 @@ def _load_file(file_path, msg):
) )
def _compile_gold(examples, pipeline, nlp): def _compile_gold(
examples: Sequence[Example], pipeline: List[str], nlp: Language
) -> Dict[str, Any]:
data = { data = {
"ner": Counter(), "ner": Counter(),
"cats": Counter(), "cats": Counter(),
@ -540,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp):
return data return data
def _format_labels(labels, counts=False): def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
if counts: if counts:
return ", ".join([f"'{l}' ({c})" for l, c in labels]) return ", ".join([f"'{l}' ({c})" for l, c in labels])
return ", ".join([f"'{l}'" for l in labels]) return ", ".join([f"'{l}'" for l in labels])
def _get_examples_without_label(data, label): def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
count = 0 count = 0
for ex in data: for ex in data:
labels = [ labels = [
@ -559,7 +589,7 @@ def _get_examples_without_label(data, label):
return count return count
def _get_labels_from_model(nlp, pipe_name): def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
if pipe_name not in nlp.pipe_names: if pipe_name not in nlp.pipe_names:
return set() return set()
pipe = nlp.get_pipe(pipe_name) pipe = nlp.get_pipe(pipe_name)

View File

@ -1,31 +1,36 @@
from typing import List from typing import Optional, Sequence, Union
import requests import requests
import os
import subprocess
import sys import sys
from wasabi import msg from wasabi import msg
import typer
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from .. import about from .. import about
from ..util import is_package, get_base_version from ..util import is_package, get_base_version, run_command
@app.command( @app.command(
"download", "download",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
) )
def download( def download_cli(
# fmt: off # fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model to download (shortcut or name)"), model: str = Arg(..., help="Model to download (shortcut or name)"),
direct: bool = Opt(False, "--direct", "-d", help="Force direct download of name + version"), direct: bool = Opt(False, "--direct", "-d", help="Force direct download of name + version"),
pip_args: List[str] = Arg(..., help="Additional arguments to be passed to `pip install` on model install"),
# fmt: on # fmt: on
): ):
""" """
Download compatible model from default download path using pip. If --direct Download compatible model from default download path using pip. If --direct
flag is set, the command expects the full model name with version. flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped. For direct downloads, the compatibility check will be skipped. All
additional arguments provided to this command will be passed to `pip install`
on model installation.
""" """
download(model, direct, *ctx.args)
def download(model: str, direct: bool = False, *pip_args) -> None:
if not is_package("spacy") and "--no-deps" not in pip_args: if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn( msg.warn(
"Skipping model package dependencies and setting `--no-deps`. " "Skipping model package dependencies and setting `--no-deps`. "
@ -41,22 +46,20 @@ def download(
components = model.split("-") components = model.split("-")
model_name = "".join(components[:-1]) model_name = "".join(components[:-1])
version = components[-1] version = components[-1]
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else: else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts") shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model) model_name = shortcuts.get(model, model)
compatibility = get_compatibility() compatibility = get_compatibility()
version = get_version(model_name, compatibility) version = get_version(model_name, compatibility)
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) download_model(dl_tpl.format(m=model_name, v=version), pip_args)
if dl != 0: # if download subprocess doesn't return 0, exit
sys.exit(dl)
msg.good( msg.good(
"Download and installation successful", "Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')", f"You can now load the model via spacy.load('{model_name}')",
) )
def get_json(url, desc): def get_json(url: str, desc: str) -> Union[dict, list]:
r = requests.get(url) r = requests.get(url)
if r.status_code != 200: if r.status_code != 200:
msg.fail( msg.fail(
@ -70,7 +73,7 @@ def get_json(url, desc):
return r.json() return r.json()
def get_compatibility(): def get_compatibility() -> dict:
version = get_base_version(about.__version__) version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table") comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"] comp = comp_table["spacy"]
@ -79,7 +82,7 @@ def get_compatibility():
return comp[version] return comp[version]
def get_version(model, comp): def get_version(model: str, comp: dict) -> str:
model = get_base_version(model) model = get_base_version(model)
if model not in comp: if model not in comp:
msg.fail( msg.fail(
@ -89,10 +92,12 @@ def get_version(model, comp):
return comp[model][0] return comp[model][0]
def download_model(
    filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:
    """Install a model package from the download URL using pip.

    filename (str): File name that is appended to about.__download_url__.
    user_pip_args (Sequence[str] / None): Additional arguments for
        `pip install`, added after the default --no-cache-dir.
    """
    extra_args = list(user_pip_args) if user_pip_args else []
    target_url = about.__download_url__ + "/" + filename
    # Run pip via the current interpreter so the package is installed into
    # the same environment this code is running in
    run_command(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "--no-cache-dir",
            *extra_args,
            target_url,
        ]
    )

View File

@ -1,29 +1,52 @@
from typing import Optional from typing import Optional, List
from timeit import default_timer as timer from timeit import default_timer as timer
from wasabi import msg from wasabi import Printer
from pathlib import Path
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from ..tokens import Doc
from ..scorer import Scorer
from ..gold import GoldCorpus from ..gold import GoldCorpus
from .. import util from .. import util
from .. import displacy from .. import displacy
@app.command("evaluate") @app.command("evaluate")
def evaluate( def evaluate_cli(
# fmt: off # fmt: off
model: str = Arg(..., help="Model name or path"), model: str = Arg(..., help="Model name or path"),
data_path: str = Arg(..., help="Location of JSON-formatted evaluation data"), data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"), gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[str] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
# fmt: on # fmt: on
): ):
""" """
Evaluate a model. To render a sample of parses in a HTML file, set an Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument. output directory as the displacy_path argument.
""" """
evaluate(
model,
data_path,
gpu_id=gpu_id,
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
silent=False,
)
def evaluate(
model: str,
data_path: Path,
gpu_id: int = -1,
gold_preproc: bool = False,
displacy_path: Optional[Path] = None,
displacy_limit: int = 25,
silent: bool = True,
) -> Scorer:
msg = Printer(no_print=silent, pretty=not silent)
util.fix_random_seed() util.fix_random_seed()
if gpu_id >= 0: if gpu_id >= 0:
util.use_gpu(gpu_id) util.use_gpu(gpu_id)
@ -78,11 +101,17 @@ def evaluate(
ents=render_ents, ents=render_ents,
) )
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
if return_scores:
return scorer.scores return scorer.scores
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True): def render_parses(
docs: List[Doc],
output_path: Path,
model_name: str = "",
limit: int = 250,
deps: bool = True,
ents: bool = True,
):
docs[0].user_data["title"] = model_name docs[0].user_data["title"] = model_name
if ents: if ents:
html = displacy.render(docs[:limit], style="ent", page=True) html = displacy.render(docs[:limit], style="ent", page=True)

View File

@ -1,7 +1,7 @@
from typing import Optional from typing import Optional, Dict, Any, Union
import platform import platform
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import Printer
import srsly import srsly
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
@ -11,7 +11,7 @@ from .. import about
@app.command("info") @app.command("info")
def info( def info_cli(
# fmt: off # fmt: off
model: Optional[str] = Arg(None, help="Optional model name"), model: Optional[str] = Arg(None, help="Optional model name"),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
@ -23,7 +23,54 @@ def info(
print model information. Flag --markdown prints details in Markdown for easy print model information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues. copy-pasting to GitHub issues.
""" """
info(model, markdown=markdown, silent=silent)
def info(
    model: Optional[str], *, markdown: bool = False, silent: bool = True
) -> Union[str, dict]:
    """Collect info about spaCy or a specific model and optionally print it.

    model (str / None): Model to get info for. If not set, info about the
        spaCy installation is generated instead.
    markdown (bool): Return the info as a Markdown string instead of a dict.
    silent (bool): Don't print anything, just return.
    RETURNS (str / dict): The Markdown string if markdown is set, otherwise
        the info data.
    """
    printer = Printer(no_print=silent, pretty=not silent)
    if not model:
        heading = "Info about spaCy"
        details = info_spacy(silent=silent)
    else:
        heading = f"Info about model '{model}'"
        details = info_model(model, silent=silent)
    # The Markdown is always rendered, even if only the dict is returned
    rendered = get_markdown(details, title=heading)
    if not markdown:
        if not silent:
            printer.table(details, title=heading)
        return details
    if not silent:
        print(rendered)
    return rendered
def info_spacy(*, silent: bool = True) -> Dict[str, Any]:
    """Generate info about the current spaCy installation.

    silent (bool): Don't print anything, just return.
    RETURNS (dict): The spaCy info.
    """
    all_models, _ = get_model_pkgs(silent=silent)
    # One "name (version)" entry per model package, comma-separated
    models = ", ".join(f"{m['name']} ({m['version']})" for m in all_models.values())
    return {
        "spaCy version": about.__version__,
        # Parent of the directory that contains this module
        "Location": str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": models,
    }
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
"""Generate info about a specific model.
model (str): Model name or path.
silent (bool): Don't print anything, just return.
RETURNS (dict): The model meta.
"""
msg = Printer(no_print=silent, pretty=not silent)
if util.is_package(model): if util.is_package(model):
model_path = util.get_package_path(model) model_path = util.get_package_path(model)
else: else:
@ -37,46 +84,22 @@ def info(
meta["source"] = str(model_path.resolve()) meta["source"] = str(model_path.resolve())
else: else:
meta["source"] = str(model_path) meta["source"] = str(model_path)
if not silent: return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
title = f"Info about model '{model}'"
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
if markdown:
print_markdown(model_meta, title=title)
else:
msg.table(model_meta, title=title)
return meta
all_models, _ = get_model_pkgs()
data = {
"spaCy version": about.__version__,
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": ", ".join(
f"{m['name']} ({m['version']})" for m in all_models.values()
),
}
if not silent:
title = "Info about spaCy"
if markdown:
print_markdown(data, title=title)
else:
msg.table(data, title=title)
return data
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
    """Get data in GitHub-flavoured Markdown format for issues etc.

    data (dict): Label/value pairs.
    title (str / None): Title, will be rendered as headline 2.
    RETURNS (str): The Markdown string.
    """
    markdown = []
    for key, value in data.items():
        if isinstance(value, str):
            # String values that point to an existing path are left out.
            # Arbitrary strings (e.g. a very long list of models) are not
            # necessarily valid paths and checking them can raise, so treat
            # those as "not a path" instead of failing.
            try:
                is_existing_path = Path(value).exists()
            except (OSError, ValueError):
                is_existing_path = False
            if is_existing_path:
                continue
        markdown.append(f"* **{key}:** {value}")
    result = "\n{}\n".format("\n".join(markdown))
    if title:
        result = f"\n## {title}\n{result}"
    return result

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, List, Dict, Any, Union, IO
import math import math
from tqdm import tqdm from tqdm import tqdm
import numpy import numpy
@ -10,11 +10,12 @@ import gzip
import zipfile import zipfile
import srsly import srsly
import warnings import warnings
from wasabi import msg from wasabi import Printer
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from ..vectors import Vectors from ..vectors import Vectors
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..language import Language
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
from ..lookups import Lookups from ..lookups import Lookups
@ -28,14 +29,14 @@ DEFAULT_OOV_PROB = -20
@app.command("init-model") @app.command("init-model")
def init_model( def init_model_cli(
# fmt: off # fmt: off
lang: str = Arg(..., help="Model language"), lang: str = Arg(..., help="Model language"),
output_dir: Path = Arg(..., help="Model output directory"), output_dir: Path = Arg(..., help="Model output directory"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file"), freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
clusters_loc: Optional[str] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data"), clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file"), jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
vectors_loc: Optional[str] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format"), vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
@ -49,6 +50,38 @@ def init_model(
and word vectors. If vectors are provided in Word2Vec format, they can and word vectors. If vectors are provided in Word2Vec format, they can
be either a .txt or zipped as a .zip or .tar.gz. be either a .txt or zipped as a .zip or .tar.gz.
""" """
init_model(
lang,
output_dir,
freqs_loc=freqs_loc,
clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc,
prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors,
vectors_name=vectors_name,
model_name=model_name,
omit_extra_lookups=omit_extra_lookups,
base_model=base_model,
silent=False,
)
def init_model(
lang: str,
output_dir: Path,
freqs_loc: Optional[Path] = None,
clusters_loc: Optional[Path] = None,
jsonl_loc: Optional[Path] = None,
vectors_loc: Optional[Path] = None,
prune_vectors: int = -1,
truncate_vectors: int = 0,
vectors_name: Optional[str] = None,
model_name: Optional[str] = None,
omit_extra_lookups: bool = False,
base_model: Optional[str] = None,
silent: bool = True,
) -> Language:
msg = Printer(no_print=silent, pretty=not silent)
if jsonl_loc is not None: if jsonl_loc is not None:
if freqs_loc is not None or clusters_loc is not None: if freqs_loc is not None or clusters_loc is not None:
settings = ["-j"] settings = ["-j"]
@ -71,7 +104,7 @@ def init_model(
freqs_loc = ensure_path(freqs_loc) freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists(): if freqs_loc is not None and not freqs_loc.exists():
msg.fail("Can't find words frequencies file", freqs_loc, exits=1) msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc) lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
with msg.loading("Creating model..."): with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
@ -86,7 +119,9 @@ def init_model(
msg.good("Successfully created model") msg.good("Successfully created model")
if vectors_loc is not None: if vectors_loc is not None:
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) add_vectors(
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
)
vec_added = len(nlp.vocab.vectors) vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab) lex_added = len(nlp.vocab)
msg.good( msg.good(
@ -98,7 +133,7 @@ def init_model(
return nlp return nlp
def open_file(loc): def open_file(loc: Union[str, Path]) -> IO:
"""Handle .gz, .tar.gz or unzipped files""" """Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc) loc = ensure_path(loc)
if tarfile.is_tarfile(str(loc)): if tarfile.is_tarfile(str(loc)):
@ -114,7 +149,9 @@ def open_file(loc):
return loc.open("r", encoding="utf8") return loc.open("r", encoding="utf8")
def read_attrs_from_deprecated(freqs_loc, clusters_loc): def read_attrs_from_deprecated(
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
) -> List[Dict[str, Any]]:
if freqs_loc is not None: if freqs_loc is not None:
with msg.loading("Counting frequencies..."): with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc) probs, _ = read_freqs(freqs_loc)
@ -142,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
return lex_attrs return lex_attrs
def create_model(lang, lex_attrs, name=None, base_model=None): def create_model(
lang: str,
lex_attrs: List[Dict[str, Any]],
name: Optional[str] = None,
base_model: Optional[Union[str, Path]] = None,
) -> Language:
if base_model: if base_model:
nlp = load_model(base_model) nlp = load_model(base_model)
# keep the tokenizer but remove any existing pipeline components due to # keep the tokenizer but remove any existing pipeline components due to
@ -169,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
return nlp return nlp
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None): def add_vectors(
msg: Printer,
nlp: Language,
vectors_loc: Optional[Path],
truncate_vectors: int,
prune_vectors: int,
name: Optional[str] = None,
) -> None:
vectors_loc = ensure_path(vectors_loc) vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@ -179,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
else: else:
if vectors_loc: if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"): with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(vectors_loc) vectors_data, vector_keys = read_vectors(msg, vectors_loc)
msg.good(f"Loaded vectors from {vectors_loc}") msg.good(f"Loaded vectors from {vectors_loc}")
else: else:
vectors_data, vector_keys = (None, None) vectors_data, vector_keys = (None, None)
@ -198,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
nlp.vocab.prune_vectors(prune_vectors) nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(vectors_loc, truncate_vectors=0): def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
f = open_file(vectors_loc) f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split()) shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1: if truncate_vectors >= 1:
@ -218,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
return vectors_data, vectors_keys return vectors_data, vectors_keys
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_freqs(
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
):
counts = PreshCounter() counts = PreshCounter()
total = 0 total = 0
with freqs_loc.open() as f: with freqs_loc.open() as f:
@ -247,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
return probs, oov_prob return probs, oov_prob
def read_clusters(clusters_loc): def read_clusters(clusters_loc: Path) -> dict:
clusters = {} clusters = {}
if ftfy is None: if ftfy is None:
warnings.warn(Warnings.W004) warnings.warn(Warnings.W004)

View File

@ -1,22 +1,24 @@
from typing import Optional from typing import Optional, Union, Any, Dict
import shutil import shutil
from pathlib import Path from pathlib import Path
from wasabi import msg, get_raw_input from wasabi import Printer, get_raw_input
import srsly import srsly
import sys
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from ..schemas import validate, ModelMetaSchema
from .. import util from .. import util
from .. import about from .. import about
@app.command("package") @app.command("package")
def package( def package_cli(
# fmt: off # fmt: off
input_dir: str = Arg(..., help="Directory with model data"), input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
output_dir: str = Arg(..., help="Output parent directory"), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
meta_path: Optional[str] = Opt(None, "--meta-path", "-m", help="Path to meta.json"), meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", help="Create meta.json, even if one exists"), create_meta: bool = Opt(False, "--create-meta", "-c", help="Create meta.json, even if one exists"),
force: bool = Opt(False, "--force", "-f", help="Force overwriting existing model in output directory"), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
# fmt: on # fmt: on
): ):
""" """
@ -26,6 +28,25 @@ def package(
set and a meta.json already exists in the output directory, the existing set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt. values will be used as the defaults in the command-line prompt.
""" """
package(
input_dir,
output_dir,
meta_path=meta_path,
create_meta=create_meta,
force=force,
silent=False,
)
def package(
input_dir: Path,
output_dir: Path,
meta_path: Optional[Path] = None,
create_meta: bool = False,
force: bool = False,
silent: bool = True,
) -> None:
msg = Printer(no_print=silent, pretty=not silent)
input_path = util.ensure_path(input_dir) input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir) output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path) meta_path = util.ensure_path(meta_path)
@ -36,23 +57,20 @@ def package(
if meta_path and not meta_path.exists(): if meta_path and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1) msg.fail("Can't find model meta.json", meta_path, exits=1)
meta_path = meta_path or input_path / "meta.json" meta_path = meta_path or input_dir / "meta.json"
if meta_path.is_file(): if not meta_path.exists() or not meta_path.is_file():
msg.fail("Can't load model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path) meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path) msg.good("Loaded meta.json from file", meta_path)
else: else:
meta = generate_meta(input_dir, meta, msg) meta = generate_meta(input_dir, meta, msg)
for key in ("lang", "name", "version"): errors = validate(ModelMetaSchema, meta)
if key not in meta or meta[key] == "": if errors:
msg.fail( msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
f"No '{key}' setting found in meta.json",
"This setting is required to build your package.",
exits=1,
)
model_name = meta["lang"] + "_" + meta["name"] model_name = meta["lang"] + "_" + meta["name"]
model_name_v = model_name + "-" + meta["version"] model_name_v = model_name + "-" + meta["version"]
main_path = output_path / model_name_v main_path = output_dir / model_name_v
package_path = main_path / model_name package_path = main_path / model_name
if package_path.exists(): if package_path.exists():
@ -66,21 +84,26 @@ def package(
exits=1, exits=1,
) )
Path.mkdir(package_path, parents=True) Path.mkdir(package_path, parents=True)
shutil.copytree(str(input_path), str(package_path / model_name_v)) shutil.copytree(str(input_dir), str(package_path / model_name_v))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT) create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good(f"Successfully created package '{model_name_v}'", main_path) msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.") with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"])
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
msg.good(f"Successfully created zipped Python package", zip_file)
def create_file(file_path: Path, contents: str) -> None:
    """Write text to a file as UTF-8, creating or overwriting it.

    file_path (Path): Location of the file to write.
    contents (str): The text to store in the file.
    """
    # Opening in "w" mode already creates the file, so no separate touch()
    # is needed. The context manager guarantees the handle is flushed and
    # closed instead of relying on garbage collection.
    with file_path.open("w", encoding="utf-8") as file_:
        file_.write(contents)
def generate_meta(model_path, existing_meta, msg): def generate_meta(
model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
) -> Dict[str, Any]:
meta = existing_meta or {} meta = existing_meta or {}
settings = [ settings = [
("lang", "Model language", meta.get("lang", "en")), ("lang", "Model language", meta.get("lang", "en")),

View File

@ -19,12 +19,12 @@ from ..gold import Example
@app.command("pretrain") @app.command("pretrain")
def pretrain( def pretrain_cli(
# fmt: off # fmt: off
texts_loc: str =Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'"), texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"), vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"),
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"), output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
config_path: Path = Arg(..., help="Path to config file"), config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
@ -45,6 +45,26 @@ def pretrain(
all settings are the same between pretraining and training. Ideally, all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands. this is done by using the same config file for both commands.
""" """
pretrain(
texts_loc,
vectors_model,
output_dir,
config_path,
use_gpu=use_gpu,
resume_path=resume_path,
epoch_resume=epoch_resume,
)
def pretrain(
texts_loc: Path,
vectors_model: str,
output_dir: Path,
config_path: Path,
use_gpu: int = -1,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
):
if not config_path or not config_path.exists(): if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1) msg.fail("Config file not found", config_path, exits=1)

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Sequence, Union, Iterator
import tqdm import tqdm
from pathlib import Path from pathlib import Path
import srsly import srsly
@ -7,17 +7,18 @@ import pstats
import sys import sys
import itertools import itertools
import ml_datasets import ml_datasets
from wasabi import msg from wasabi import msg, Printer
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from ..language import Language
from ..util import load_model from ..util import load_model
@app.command("profile") @app.command("profile")
def profile( def profile_cli(
# fmt: off # fmt: off
model: str = Arg(..., help="Model to load"), model: str = Arg(..., help="Model to load"),
inputs: Optional[str] = Arg(None, help="Location of input file. '-' for stdin."), inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"), n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
# fmt: on # fmt: on
): ):
@ -27,6 +28,10 @@ def profile(
It can either be provided as a JSONL file, or be read from sys.stdin. It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via Thinc. If no input file is specified, the IMDB dataset is loaded via Thinc.
""" """
profile(model, inputs=inputs, n_texts=n_texts)
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
if inputs is not None: if inputs is not None:
inputs = _read_inputs(inputs, msg) inputs = _read_inputs(inputs, msg)
if inputs is None: if inputs is None:
@ -46,12 +51,12 @@ def profile(
s.strip_dirs().sort_stats("time").print_stats() s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts): def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass pass
def _read_inputs(loc, msg): def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
if loc == "-": if loc == "-":
msg.info("Reading input from sys.stdin") msg.info("Reading input from sys.stdin")
file_ = sys.stdin file_ = sys.stdin

View File

@ -1,64 +1,25 @@
from typing import List, Dict from typing import List, Dict, Any
import typer import typer
import srsly import srsly
from pathlib import Path from pathlib import Path
import os
import subprocess
import sys
from wasabi import msg from wasabi import msg
import shlex import shlex
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from .. import about from .. import about
from ..schemas import ProjectConfigSchema, validate from ..schemas import ProjectConfigSchema, validate
from ..util import run_command
CONFIG_FILE = "project.yml" CONFIG_FILE = "project.yml"
SUBDIRS = [ DIRS = ["assets", "configs", "packages", "metrics", "scripts", "notebooks", "training"]
"assets",
"configs",
"packages",
"metrics",
"scripts",
"notebooks",
"training",
]
project_cli = typer.Typer(help="Command-line interface for spaCy projects") project_cli = typer.Typer(help="Command-line interface for spaCy projects")
def load_project_config(path):
    """Read the project config file from a directory and validate it.

    path (Path): Directory expected to contain the CONFIG_FILE.
    RETURNS (dict): The config as returned by srsly.read_yaml.
    """
    config_file = path / CONFIG_FILE
    if not config_file.exists():
        msg.fail("Can't find project config", config_file, exits=1)
    project_config = srsly.read_yaml(config_file)
    # validate() yields a list of error strings; an empty result means valid
    validation_errors = validate(ProjectConfigSchema, project_config)
    if validation_errors:
        details = "\n".join(validation_errors)
        msg.fail(f"Invalid project config in {CONFIG_FILE}", details, exits=1)
    return project_config
def create_dirs(project_dir: Path):
    """Create each directory named in SUBDIRS inside the project directory.

    project_dir (Path): The parent directory to create the subdirectories in.
    """
    for name in SUBDIRS:
        target = project_dir / name
        target.mkdir(parents=True)
def run_cmd(command: str):
    """Split a command string and run it as a subprocess.

    Exits the current process with the subprocess's status if it is non-zero.

    command (str): The command line to run.
    """
    args = shlex.split(command)
    exit_code = subprocess.call(args, env=os.environ.copy())
    if exit_code:
        sys.exit(exit_code)
def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}):
    """Format each command with the given variables, log it and run it.

    commands (list): Command strings, optionally containing {NAME} fields.
    variables (dict): Values substituted into the commands via str.format.
    """
    # NOTE: the shared default dict is only read here, never mutated.
    for template in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        formatted = template.format(**variables)
        msg.info(formatted)
        run_cmd(formatted)
@project_cli.command("clone") @project_cli.command("clone")
def project_clone( def project_clone_cli(
# fmt: off # fmt: off
name: str = Arg(..., help="The name of the template to fetch"), name: str = Arg(..., help="The name of the template to fetch"),
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=True, file_okay=False), dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=True, file_okay=False),
@ -70,13 +31,17 @@ def project_clone(
@project_cli.command("run") @project_cli.command("run")
def project_run( def project_run_cli(
# fmt: off # fmt: off
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
subcommand: str = Arg(None, help="Name of command defined in project config") subcommand: str = Arg(None, help="Name of command defined in project config")
# fmt: on # fmt: on
): ):
"""Run scripts defined in the project.""" """Run scripts defined in the project."""
project_run(project_dir, subcommand)
def project_run(project_dir: Path, subcommand: str) -> None:
config = load_project_config(project_dir) config = load_project_config(project_dir)
config_commands = config.get("commands", []) config_commands = config.get("commands", [])
variables = config.get("variables", {}) variables = config.get("variables", {})
@ -98,3 +63,27 @@ def project_run(
app.add_typer(project_cli, name="project") app.add_typer(project_cli, name="project")
def load_project_config(path: Path) -> Dict[str, Any]:
    """Read the project config file from a directory and validate it.

    path (Path): Directory expected to contain the CONFIG_FILE.
    RETURNS (dict): The config as returned by srsly.read_yaml.
    """
    config_file = path / CONFIG_FILE
    if not config_file.exists():
        msg.fail("Can't find project config", config_file, exits=1)
    project_config = srsly.read_yaml(config_file)
    # validate() yields a list of error strings; an empty result means valid
    validation_errors = validate(ProjectConfigSchema, project_config)
    if validation_errors:
        details = "\n".join(validation_errors)
        msg.fail(f"Invalid project config in {CONFIG_FILE}", details, exits=1)
    return project_config
def create_dirs(project_dir: Path) -> None:
    """Create each directory named in DIRS inside the project directory.

    project_dir (Path): The parent directory to create the subdirectories in.
    """
    for name in DIRS:
        target = project_dir / name
        target.mkdir(parents=True)
def run_commands(commands: List[str] = tuple(), variables: Dict[str, str] = {}) -> None:
    """Format each command with the given variables, log it and run it.

    commands (list): Command strings, optionally containing {NAME} fields.
    variables (dict): Values substituted into the commands via str.format.
    """
    # NOTE: the shared default dict is only read here, never mutated.
    for template in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        formatted = template.format(**variables)
        msg.info(formatted)
        run_command(shlex.split(formatted))

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Dict
from timeit import default_timer as timer from timeit import default_timer as timer
import srsly import srsly
import tqdm import tqdm
@ -85,9 +85,9 @@ subword_features = true
@app.command("train") @app.command("train")
def train_cli( def train_cli(
# fmt: off # fmt: off
train_path: Path = Arg(..., help="Location of JSON-formatted training data"), train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data"), dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
config_path: Path = Arg(..., help="Path to config file"), config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"), output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."), init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
@ -162,14 +162,14 @@ def train_cli(
def train( def train(
config_path, config_path: Path,
data_paths, data_paths: Dict[str, Path],
raw_text=None, raw_text: Optional[Path] = None,
output_path=None, output_path: Optional[Path] = None,
tag_map=None, tag_map: Optional[Path] = None,
weights_data=None, weights_data: Optional[bytes] = None,
omit_extra_lookups=False, omit_extra_lookups: bool = False,
): ) -> None:
msg.info(f"Loading config from: {config_path}") msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config # Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False) config = util.load_config(config_path, create_objects=False)

View File

@ -1,7 +1,8 @@
from typing import Tuple
from pathlib import Path from pathlib import Path
import sys import sys
import requests import requests
from wasabi import msg from wasabi import msg, Printer
from ._app import app from ._app import app
from .. import about from .. import about
@ -10,11 +11,15 @@ from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate") @app.command("validate")
def validate(): def validate_cli():
""" """
Validate that the currently installed version of spaCy is compatible Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`. with the installed models. Should be run after `pip install -U spacy`.
""" """
validate()
def validate() -> None:
model_pkgs, compat = get_model_pkgs() model_pkgs, compat = get_model_pkgs()
spacy_version = get_base_version(about.__version__) spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {}) current_compat = compat.get(spacy_version, {})
@ -57,7 +62,8 @@ def validate():
sys.exit(1) sys.exit(1)
def get_model_pkgs(): def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
msg = Printer(no_print=silent, pretty=not silent)
with msg.loading("Loading compatibility table..."): with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__) r = requests.get(about.__compatibility__)
if r.status_code != 200: if r.status_code != 200:
@ -95,7 +101,7 @@ def get_model_pkgs():
return pkgs, compat return pkgs, compat
def reformat_version(version): def reformat_version(version: str) -> str:
"""Hack to reformat old versions ending on '-alpha' to match pip format.""" """Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith("-alpha"): if version.endswith("-alpha"):
return version.replace("-alpha", "a0") return version.replace("-alpha", "a0")

View File

@ -1,4 +1,4 @@
from typing import Dict, List, Union, Optional, Sequence from typing import Dict, List, Union, Optional, Sequence, Any
from enum import Enum from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath
@ -164,7 +164,7 @@ class ModelMetaSchema(BaseModel):
email: Optional[StrictStr] = Field(None, title="Model author email") email: Optional[StrictStr] = Field(None, title="Model author email")
url: Optional[StrictStr] = Field(None, title="Model author URL") url: Optional[StrictStr] = Field(None, title="Model author URL")
sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors") vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors")
accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
# fmt: on # fmt: on

View File

@ -1,10 +1,10 @@
from typing import List, Union
import os import os
import importlib import importlib
import importlib.util import importlib.util
import re import re
from pathlib import Path from pathlib import Path
import random import random
from typing import List
import thinc import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
import functools import functools
@ -17,6 +17,8 @@ import sys
import warnings import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion from packaging.version import Version, InvalidVersion
import subprocess
from contextlib import contextmanager
try: try:
@ -427,6 +429,30 @@ def get_package_path(name):
return Path(pkg.__file__).parent return Path(pkg.__file__).parent
def run_command(command: List[str]) -> None:
    """Execute an already-split command as a subprocess.

    The subprocess receives a copy of the current environment. If it returns
    a non-zero status, the current process exits with that same status.

    command (list): The split command.
    """
    exit_code = subprocess.call(command, env=os.environ.copy())
    if exit_code:
        sys.exit(exit_code)
@contextmanager
def working_dir(path: Union[str, Path]) -> None:
    """Temporarily switch the current working directory.

    The directory that was current on entry is restored when the block
    exits, including when the block raises.

    path (str / Path): The directory to navigate to.
    """
    original_dir = os.getcwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        # Always go back, even if the body of the with-block failed
        os.chdir(original_dir)
def is_in_jupyter(): def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the """Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer. IPython kernel. Mainly used for the displaCy visualizer.