Merge pull request #5626 from explosion/feature/typer

Ines Montani 2020-06-22 06:29:03 -07:00, committed by GitHub
commit 83b4aa05c9
20 changed files with 630 additions and 303 deletions

requirements.txt

@ -5,13 +5,13 @@ thinc==8.0.0a9
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
wasabi>=0.7.0,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.2.1,<1.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.3.0,<2.0.0
# Official Python utilities

setup.cfg

@ -44,14 +44,13 @@ install_requires =
preshed>=3.0.2,<3.1.0
thinc==8.0.0a9
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
wasabi>=0.7.0,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
ml_datasets>=0.1.1
typer>=0.2.1,<1.0.0
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
# Official Python utilities

spacy/__init__.py

@ -8,7 +8,7 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
from thinc.api import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info
from .cli.info import info
from .glossary import explain
from .about import __version__
from .errors import Errors, Warnings
@ -34,7 +34,3 @@ def load(name, **overrides):
def blank(name, **kwargs):
LangClass = util.get_lang_class(name)
return LangClass(**kwargs)
def info(model=None, markdown=False, silent=False):
return cli_info(model, markdown, silent)

spacy/__main__.py

@ -1,31 +1,4 @@
if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from spacy.cli import download, link, info, package, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
from spacy.cli import train_cli
from spacy.cli import setup_cli
commands = {
"download": download,
"link": link,
"info": info,
"train": train_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,
"convert": convert,
"package": package,
"init-model": init_model,
"profile": profile,
"validate": validate,
}
if len(sys.argv) == 1:
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = f"spacy {command}"
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
available = f"Available: {', '.join(commands)}"
msg.fail(f"Unknown command: {command}", available, exits=1)
setup_cli()

spacy/about.py

@ -5,3 +5,4 @@ __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
__projects__ = "https://github.com/explosion/spacy-boilerplates"

spacy/cli/__init__.py

@ -1,10 +1,14 @@
from wasabi import msg
from ._app import app, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train_from_config import train_cli # noqa: F401
from .train_from_config import train # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
@ -13,7 +17,10 @@ from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
def link(*args, **kwargs):
"""As of spaCy v3.0, model symlinks are deprecated. You can load models
using their full names or from a directory path."""
msg.warn(
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
"using their full names or from a directory path."
)

spacy/cli/_app.py (new file, 31 lines)

@ -0,0 +1,31 @@
from typing import Optional
import typer
from typer.main import get_command
COMMAND = "python -m spacy"
NAME = "spacy"
HELP = """spaCy Command-line Interface
DOCS: https://spacy.io/api/cli
"""
app = typer.Typer(name=NAME, help=HELP)
def Arg(*args, help: Optional[str] = None, **kwargs) -> typer.Argument:
"""Wrapper for Typer's annotation to keep it short and set defaults."""
# Filter out help for now until it's officially supported
return typer.Argument(*args, **kwargs)
def Opt(*args, **kwargs) -> typer.Option:
"""Wrapper for Typer's annotation to keep it short and set defaults."""
return typer.Option(*args, show_default=True, **kwargs)
def setup_cli() -> None:
# Ensure that the help messages always display the correct prompt
command = get_command(app)
command(prog_name=COMMAND)
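
Every refactored command registers against this app with the same pattern. A minimal sketch, not part of the diff (the "hello" command and its options are hypothetical):

from spacy.cli._app import app, Arg, Opt

@app.command("hello")
def hello_cli(
    # fmt: off
    name: str = Arg(..., help="Name to greet"),
    loud: bool = Opt(False, "--loud", "-l", help="Upper-case the greeting"),
    # fmt: on
):
    """Toy command illustrating the Arg/Opt pattern the real commands use."""
    text = f"Hello, {name}!"
    print(text.upper() if loud else text)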

spacy/cli/convert.py

@ -1,8 +1,11 @@
from typing import Optional
from enum import Enum
from pathlib import Path
from wasabi import Printer
import srsly
import re
from ._app import app, Arg, Opt
from .converters import conllu2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json
@ -21,23 +24,29 @@ CONVERTERS = {
}
# File types
FILE_TYPES = ("json", "jsonl", "msg")
FILE_TYPES_STDOUT = ("json", "jsonl")
def convert(
class FileTypes(str, Enum):
json = "json"
jsonl = "jsonl"
msg = "msg"
@app.command("convert")
def convert_cli(
# fmt: off
input_file: ("Input file", "positional", None, str),
output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
lang: ("Language (if tokenizer required)", "option", "l", str) = None,
input_file: str = Arg(..., help="Input file", exists=True),
output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"),
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
# fmt: on
):
"""
@ -46,8 +55,42 @@ def convert(
is written to stdout, so you can pipe them forward to a JSON file:
$ spacy convert some_file.conllu > some_file.json
"""
no_print = output_dir == "-"
msg = Printer(no_print=no_print)
if isinstance(file_type, FileTypes):
# We get an instance of the FileTypes from the CLI so we need its string value
file_type = file_type.value
silent = output_dir == "-"
convert(
input_file,
output_dir,
file_type=file_type,
n_sents=n_sents,
seg_sents=seg_sents,
model=model,
morphology=morphology,
merge_subtokens=merge_subtokens,
converter=converter,
ner_map_path=ner_map_path,
lang=lang,
silent=silent,
)
def convert(
input_file: Path,
output_dir: Path,
*,
file_type: str = "json",
n_sents: int = 1,
seg_sents: bool = False,
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
ner_map_path: Optional[Path] = None,
lang: Optional[str] = None,
silent: bool = True,
) -> None:
msg = Printer(no_print=silent, pretty=not silent)
input_path = Path(input_file)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
# TODO: support msgpack via stdout in srsly?
@ -73,7 +116,8 @@ def convert(
converter = converter_autodetect
else:
msg.warn(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
"Can't automatically detect NER format. Conversion may not "
"succeed. See https://spacy.io/api/cli#convert"
)
if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
@ -90,7 +134,7 @@ def convert(
merge_subtokens=merge_subtokens,
lang=lang,
model=model,
no_print=no_print,
no_print=silent,
ner_map=ner_map,
)
if output_dir != "-":
@ -112,7 +156,7 @@ def convert(
srsly.write_jsonl("-", data)
def autodetect_ner_format(input_data):
def autodetect_ner_format(input_data: str) -> str:
# guess format from the first 20 lines
lines = input_data.split("\n")[:20]
format_guesses = {"ner": 0, "iob": 0}
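
This file sets the template for the whole PR: a thin convert_cli wrapper receives the Typer-parsed values and delegates to a plain, keyword-only convert() function that can be called from Python. A hedged usage sketch; the paths and converter key are assumptions:

from pathlib import Path
from spacy.cli.convert import convert  # the plain function, not convert_cli

convert(
    Path("some_file.conllu"),  # hypothetical input file
    Path("output_dir"),        # hypothetical output directory
    file_type="json",
    n_sents=10,
    converter="conllu",        # assumed converter key
    silent=False,              # print progress via wasabi
)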

spacy/cli/debug_data.py

@ -1,11 +1,14 @@
from typing import Optional, List, Sequence, Dict, Any, Tuple
from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES
from ..gold import GoldCorpus
from ._app import app, Arg, Opt
from ..gold import GoldCorpus, Example
from ..syntax import nonproj
from ..language import Language
from ..util import load_model, get_lang_class
@ -18,17 +21,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
def debug_data(
@app.command("debug-data")
def debug_data_cli(
# fmt: off
lang: ("Model language", "positional", None, str),
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
lang: str = Arg(..., help="Model language"),
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
# fmt: on
):
"""
@ -36,8 +40,36 @@ def debug_data(
stats, and find problems like invalid entity annotations, cyclic
dependencies, low data labels and more.
"""
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
debug_data(
lang,
train_path,
dev_path,
tag_map_path=tag_map_path,
base_model=base_model,
pipeline=[p.strip() for p in pipeline.split(",")],
ignore_warnings=ignore_warnings,
verbose=verbose,
no_format=no_format,
silent=False,
)
def debug_data(
lang: str,
train_path: Path,
dev_path: Path,
*,
tag_map_path: Optional[Path] = None,
base_model: Optional[str] = None,
pipeline: List[str] = ["tagger", "parser", "ner"],
ignore_warnings: bool = False,
verbose: bool = False,
no_format: bool = True,
silent: bool = True,
):
msg = Printer(
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
)
# Make sure all files and paths exists if they are needed
if not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
@ -49,7 +81,6 @@ def debug_data(
tag_map = srsly.read_json(tag_map_path)
# Initialize the model and pipeline
pipeline = [p.strip() for p in pipeline.split(",")]
if base_model:
nlp = load_model(base_model)
else:
@ -446,7 +477,7 @@ def debug_data(
sys.exit(1)
def _load_file(file_path, msg):
def _load_file(file_path: Path, msg: Printer) -> None:
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
with msg.loading(f"Loading {file_name}..."):
@ -465,7 +496,9 @@ def _load_file(file_path, msg):
)
def _compile_gold(examples, pipeline, nlp):
def _compile_gold(
examples: Sequence[Example], pipeline: List[str], nlp: Language
) -> Dict[str, Any]:
data = {
"ner": Counter(),
"cats": Counter(),
@ -537,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp):
return data
def _format_labels(labels, counts=False):
def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
if counts:
return ", ".join([f"'{l}' ({c})" for l, c in labels])
return ", ".join([f"'{l}'" for l in labels])
def _get_examples_without_label(data, label):
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
count = 0
for ex in data:
labels = [
@ -556,7 +589,7 @@ def _get_examples_without_label(data, label):
return count
def _get_labels_from_model(nlp, pipe_name):
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
if pipe_name not in nlp.pipe_names:
return set()
pipe = nlp.get_pipe(pipe_name)
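
As with convert, the wrapper pre-processes CLI values (splitting the pipeline string) and hands off to the plain debug_data(), which is now usable programmatically. A sketch under assumed paths:

from pathlib import Path
from spacy.cli.debug_data import debug_data

debug_data(
    "en",
    Path("train.json"),  # hypothetical training data
    Path("dev.json"),    # hypothetical development data
    pipeline=["ner"],
    silent=False,  # actually print the report
)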

spacy/cli/download.py

@ -1,23 +1,36 @@
from typing import Optional, Sequence, Union
import requests
import os
import subprocess
import sys
from wasabi import msg
import typer
from ._app import app, Arg, Opt
from .. import about
from ..util import is_package, get_base_version
from ..util import is_package, get_base_version, run_command
def download(
model: ("Model to download (shortcut or name)", "positional", None, str),
direct: ("Force direct download of name + version", "flag", "d", bool) = False,
*pip_args: ("Additional arguments to be passed to `pip install` on model install"),
@app.command(
"download",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def download_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model to download (shortcut or name)"),
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
# fmt: on
):
"""
Download compatible model from default download path using pip. If --direct
flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped.
For direct downloads, the compatibility check will be skipped. All
additional arguments provided to this command will be passed to `pip install`
on model installation.
"""
download(model, direct, *ctx.args)
def download(model: str, direct: bool = False, *pip_args) -> None:
if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
@ -33,22 +46,20 @@ def download(
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
if dl != 0: # if download subprocess doesn't return 0, exit
sys.exit(dl)
msg.good(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
msg.good(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
def get_json(url, desc):
def get_json(url: str, desc: str) -> Union[dict, list]:
r = requests.get(url)
if r.status_code != 200:
msg.fail(
@ -62,7 +73,7 @@ def get_json(url, desc):
return r.json()
def get_compatibility():
def get_compatibility() -> dict:
version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"]
@ -71,7 +82,7 @@ def get_compatibility():
return comp[version]
def get_version(model, comp):
def get_version(model: str, comp: dict) -> str:
model = get_base_version(model)
if model not in comp:
msg.fail(
@ -81,10 +92,12 @@ def get_version(model, comp):
return comp[model][0]
def download_model(filename, user_pip_args=None):
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:
download_url = about.__download_url__ + "/" + filename
pip_args = ["--no-cache-dir"]
if user_pip_args:
pip_args.extend(user_pip_args)
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
return subprocess.call(cmd, env=os.environ.copy())
run_command(cmd)
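
Note the behavior change here: download_model() no longer returns an exit code, since run_command() (added to spacy/util.py at the end of this diff) exits the process itself on failure. A sketch of programmatic use; the extra pip flags are just examples:

from spacy.cli import download

# Positional extras after `direct` are forwarded to `pip install`.
download("en_core_web_sm", False, "--no-deps", "--quiet")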

spacy/cli/evaluate.py

@ -1,26 +1,52 @@
from typing import Optional, List
from timeit import default_timer as timer
from wasabi import msg
from wasabi import Printer
from pathlib import Path
from ._app import app, Arg, Opt
from ..tokens import Doc
from ..scorer import Scorer
from ..gold import GoldCorpus
from .. import util
from .. import displacy
def evaluate(
@app.command("evaluate")
def evaluate_cli(
# fmt: off
model: ("Model name or path", "positional", None, str),
data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
gpu_id: ("Use GPU", "option", "g", int) = -1,
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
# fmt: on
):
"""
Evaluate a model. To render a sample of parses in an HTML file, set an
output directory as the displacy_path argument.
"""
evaluate(
model,
data_path,
gpu_id=gpu_id,
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
silent=False,
)
def evaluate(
model: str,
data_path: Path,
gpu_id: int = -1,
gold_preproc: bool = False,
displacy_path: Optional[Path] = None,
displacy_limit: int = 25,
silent: bool = True,
) -> Scorer:
msg = Printer(no_print=silent, pretty=not silent)
util.fix_random_seed()
if gpu_id >= 0:
util.use_gpu(gpu_id)
@ -75,11 +101,17 @@ def evaluate(
ents=render_ents,
)
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
if return_scores:
return scorer.scores
return scorer.scores
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
def render_parses(
docs: List[Doc],
output_path: Path,
model_name: str = "",
limit: int = 250,
deps: bool = True,
ents: bool = True,
):
docs[0].user_data["title"] = model_name
if ents:
html = displacy.render(docs[:limit], style="ent", page=True)

spacy/cli/info.py

@ -1,77 +1,109 @@
from typing import Optional, Dict, Any, Union
import platform
from pathlib import Path
from wasabi import msg
from wasabi import Printer
import srsly
from .validate import get_model_pkgs
from ._app import app, Arg, Opt
from .. import util
from .. import about
def info(
model: ("Optional model name", "positional", None, str) = None,
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
silent: ("Don't print anything (just return)", "flag", "s") = False,
@app.command("info")
def info_cli(
# fmt: off
model: Optional[str] = Arg(None, help="Optional model name"),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
# fmt: on
):
"""
Print info about the spaCy installation. If a model is specified as an argument,
print model information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
"""
info(model, markdown=markdown, silent=silent)
def info(
model: Optional[str] = None, *, markdown: bool = False, silent: bool = True
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if model:
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = model
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["link"] = str(model_path)
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
title = f"Info about model '{model}'"
data = info_model(model, silent=silent)
else:
title = "Info about spaCy"
data = info_spacy()
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
if "Models" in data and isinstance(data["Models"], dict):
data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
markdown_data = get_markdown(data, title=title)
if markdown:
if not silent:
title = f"Info about model '{model}'"
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
if markdown:
print_markdown(model_meta, title=title)
else:
msg.table(model_meta, title=title)
return meta
all_models, _ = get_model_pkgs()
data = {
print(markdown_data)
return markdown_data
if not silent:
table_data = dict(data)
msg.table(table_data, title=title)
return raw_data
def info_spacy() -> Dict[str, Any]:
"""Generate info about the current spaCy installation.
RETURNS (dict): The spaCy info.
"""
all_models = {}
for pkg_name in util.get_installed_models():
package = pkg_name.replace("-", "_")
all_models[package] = util.get_package_version(pkg_name)
return {
"spaCy version": about.__version__,
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": ", ".join(
f"{m['name']} ({m['version']})" for m in all_models.values()
),
"Models": all_models,
}
if not silent:
title = "Info about spaCy"
if markdown:
print_markdown(data, title=title)
else:
msg.table(data, title=title)
return data
def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
"""Generate info about a specific model.
model (str): Model name or path.
silent (bool): Don't print anything, just return.
RETURNS (dict): The model meta.
"""
msg = Printer(no_print=silent, pretty=not silent)
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = model
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["link"] = str(model_path)
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
"""Get data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
title (str / None): Title, will be rendered as headline 2.
RETURNS (str): The Markdown string.
"""
markdown = []
for key, value in data.items():
if isinstance(value, str) and Path(value).exists():
continue
markdown.append(f"* **{key}:** {value}")
result = "\n{}\n".format("\n".join(markdown))
if title:
print(f"\n## {title}")
print("\n{}\n".format("\n".join(markdown)))
result = f"\n## {title}\n{result}"
return result
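
With data generation separated from printing, spacy.info (re-exported in spacy/__init__.py above) doubles as an API. A usage sketch:

import spacy

data = spacy.info(silent=True)  # dict with keys like "spacy_version"
issue_md = spacy.info(markdown=True, silent=True)  # Markdown string for GitHub issues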

spacy/cli/init_model.py

@ -1,3 +1,4 @@
from typing import Optional, List, Dict, Any, Union, IO
import math
from tqdm import tqdm
import numpy
@ -9,10 +10,12 @@ import gzip
import zipfile
import srsly
import warnings
from wasabi import msg
from wasabi import Printer
from ._app import app, Arg, Opt
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..language import Language
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
from ..lookups import Lookups
@ -25,20 +28,21 @@ except ImportError:
DEFAULT_OOV_PROB = -20
def init_model(
@app.command("init-model")
def init_model_cli(
# fmt: off
lang: ("Model language", "positional", None, str),
output_dir: ("Model output directory", "positional", None, Path),
freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
truncate_vectors: ("Optional number of vectors to truncate to when reading in vectors file", "option", "t", int) = 0,
vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
model_name: ("Optional name for the model meta", "option", "mn", str) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
base_model: ("Base model (for languages with custom tokenizers)", "option", "b", str) = None
lang: str = Arg(..., help="Model language"),
output_dir: Path = Arg(..., help="Model output directory"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
# fmt: on
):
"""
@ -46,6 +50,38 @@ def init_model(
and word vectors. If vectors are provided in Word2Vec format, they can
be either a .txt or zipped as a .zip or .tar.gz.
"""
init_model(
lang,
output_dir,
freqs_loc=freqs_loc,
clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc,
prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors,
vectors_name=vectors_name,
model_name=model_name,
omit_extra_lookups=omit_extra_lookups,
base_model=base_model,
silent=False,
)
def init_model(
lang: str,
output_dir: Path,
freqs_loc: Optional[Path] = None,
clusters_loc: Optional[Path] = None,
jsonl_loc: Optional[Path] = None,
vectors_loc: Optional[Path] = None,
prune_vectors: int = -1,
truncate_vectors: int = 0,
vectors_name: Optional[str] = None,
model_name: Optional[str] = None,
omit_extra_lookups: bool = False,
base_model: Optional[str] = None,
silent: bool = True,
) -> Language:
msg = Printer(no_print=silent, pretty=not silent)
if jsonl_loc is not None:
if freqs_loc is not None or clusters_loc is not None:
settings = ["-j"]
@ -68,7 +104,7 @@ def init_model(
freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists():
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
@ -83,7 +119,9 @@ def init_model(
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
add_vectors(
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
@ -95,7 +133,7 @@ def init_model(
return nlp
def open_file(loc):
def open_file(loc: Union[str, Path]) -> IO:
"""Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc)
if tarfile.is_tarfile(str(loc)):
@ -111,7 +149,9 @@ def open_file(loc):
return loc.open("r", encoding="utf8")
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
def read_attrs_from_deprecated(
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
) -> List[Dict[str, Any]]:
if freqs_loc is not None:
with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc)
@ -139,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
return lex_attrs
def create_model(lang, lex_attrs, name=None, base_model=None):
def create_model(
lang: str,
lex_attrs: List[Dict[str, Any]],
name: Optional[str] = None,
base_model: Optional[Union[str, Path]] = None,
) -> Language:
if base_model:
nlp = load_model(base_model)
# keep the tokenizer but remove any existing pipeline components due to
@ -166,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
return nlp
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
def add_vectors(
msg: Printer,
nlp: Language,
vectors_loc: Optional[Path],
truncate_vectors: int,
prune_vectors: int,
name: Optional[str] = None,
) -> None:
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@ -176,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
else:
if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(vectors_loc)
vectors_data, vector_keys = read_vectors(msg, vectors_loc)
msg.good(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
@ -195,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(vectors_loc, truncate_vectors=0):
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1:
@ -215,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
return vectors_data, vectors_keys
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
def read_freqs(
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
):
counts = PreshCounter()
total = 0
with freqs_loc.open() as f:
@ -244,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
return probs, oov_prob
def read_clusters(clusters_loc):
def read_clusters(clusters_loc: Path) -> dict:
clusters = {}
if ftfy is None:
warnings.warn(Warnings.W004)

spacy/cli/package.py

@ -1,19 +1,24 @@
from typing import Optional, Union, Any, Dict
import shutil
from pathlib import Path
from wasabi import msg, get_raw_input
from wasabi import Printer, get_raw_input
import srsly
import sys
from ._app import app, Arg, Opt
from ..schemas import validate, ModelMetaSchema
from .. import util
from .. import about
def package(
@app.command("package")
def package_cli(
# fmt: off
input_dir: ("Directory with model data", "positional", None, str),
output_dir: ("Output parent directory", "positional", None, str),
meta_path: ("Path to meta.json", "option", "m", str) = None,
create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
# fmt: on
):
"""
@ -23,6 +28,25 @@ def package(
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
"""
package(
input_dir,
output_dir,
meta_path=meta_path,
create_meta=create_meta,
force=force,
silent=False,
)
def package(
input_dir: Path,
output_dir: Path,
meta_path: Optional[Path] = None,
create_meta: bool = False,
force: bool = False,
silent: bool = True,
) -> None:
msg = Printer(no_print=silent, pretty=not silent)
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
@ -33,23 +57,20 @@ def package(
if meta_path and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path)
else:
meta = generate_meta(input_dir, meta, msg)
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(
f"No '{key}' setting found in meta.json",
"This setting is required to build your package.",
exits=1,
)
meta_path = meta_path or input_dir / "meta.json"
if not meta_path.exists() or not meta_path.is_file():
msg.fail("Can't load model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path)
else:
meta = generate_meta(input_dir, meta, msg)
errors = validate(ModelMetaSchema, meta)
if errors:
msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
model_name = meta["lang"] + "_" + meta["name"]
model_name_v = model_name + "-" + meta["version"]
main_path = output_path / model_name_v
main_path = output_dir / model_name_v
package_path = main_path / model_name
if package_path.exists():
@ -63,21 +84,26 @@ def package(
exits=1,
)
Path.mkdir(package_path, parents=True)
shutil.copytree(str(input_path), str(package_path / model_name_v))
shutil.copytree(str(input_dir), str(package_path / model_name_v))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.")
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"])
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
msg.good(f"Successfully created zipped Python package", zip_file)
def create_file(file_path, contents):
def create_file(file_path: Path, contents: str) -> None:
file_path.touch()
file_path.open("w", encoding="utf-8").write(contents)
def generate_meta(model_path, existing_meta, msg):
def generate_meta(
model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
) -> Dict[str, Any]:
meta = existing_meta or {}
settings = [
("lang", "Model language", meta.get("lang", "en")),

spacy/cli/pretrain.py

@ -1,14 +1,15 @@
from typing import Optional
import random
import numpy
import time
import re
from collections import Counter
import plac
from pathlib import Path
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
from wasabi import msg
import srsly
from ._app import app, Arg, Opt
from ..errors import Errors
from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc
@ -17,25 +18,17 @@ from .. import util
from ..gold import Example
@plac.annotations(
@app.command("pretrain")
def pretrain_cli(
# fmt: off
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
use_gpu=("Use GPU", "option", "g", int),
resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"),
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
# fmt: on
)
def pretrain(
texts_loc,
vectors_model,
config_path,
output_dir,
use_gpu=-1,
resume_path=None,
epoch_resume=None,
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@ -52,6 +45,26 @@ def pretrain(
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
"""
pretrain(
texts_loc,
vectors_model,
output_dir,
config_path,
use_gpu=use_gpu,
resume_path=resume_path,
epoch_resume=epoch_resume,
)
def pretrain(
texts_loc: Path,
vectors_model: str,
output_dir: Path,
config_path: Path,
use_gpu: int = -1,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)

spacy/cli/profile.py

@ -1,3 +1,4 @@
from typing import Optional, Sequence, Union, Iterator
import tqdm
from pathlib import Path
import srsly
@ -5,17 +6,19 @@ import cProfile
import pstats
import sys
import itertools
import ml_datasets
from wasabi import msg
from wasabi import msg, Printer
from ._app import app, Arg, Opt
from ..language import Language
from ..util import load_model
def profile(
@app.command("profile")
def profile_cli(
# fmt: off
model: ("Model to load", "positional", None, str),
inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
model: str = Arg(..., help="Model to load"),
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
# fmt: on
):
"""
@ -24,6 +27,18 @@ def profile(
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via ml_datasets.
"""
profile(model, inputs=inputs, n_texts=n_texts)
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
try:
import ml_datasets
except ImportError:
msg.fail(
"This command requires the ml_datasets library to be installed:"
"pip install ml_datasets",
exits=1,
)
if inputs is not None:
inputs = _read_inputs(inputs, msg)
if inputs is None:
@ -43,12 +58,12 @@ def profile(
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass
def _read_inputs(loc, msg):
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
if loc == "-":
msg.info("Reading input from sys.stdin")
file_ = sys.stdin

spacy/cli/train_from_config.py

@ -1,16 +1,15 @@
from typing import Optional, Dict, List, Union, Sequence
from typing import Optional, Dict
from timeit import default_timer as timer
import srsly
from pydantic import BaseModel, FilePath
import tqdm
from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import Model, use_pytorch_for_gpu_memory
from thinc.api import use_pytorch_for_gpu_memory
import random
from ._app import app, Arg, Opt
from ..gold import GoldCorpus
from ..lookups import Lookups
from .. import util
@ -19,6 +18,9 @@ from ..errors import Errors
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
# from ..schemas import ConfigSchema # TODO: include?
registry = util.registry
CONFIG_STR = """
@ -80,54 +82,20 @@ subword_features = true
"""
class PipelineComponent(BaseModel):
factory: str
model: Model
class Config:
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
optimizer: Optional["Optimizer"]
class training(BaseModel):
patience: int = 10
eval_frequency: int = 100
dropout: float = 0.2
init_tok2vec: Optional[FilePath] = None
max_epochs: int = 100
orth_variant_level: float = 0.0
gold_preproc: bool = False
max_length: int = 0
use_gpu: int = 0
scores: List[str] = ["ents_p", "ents_r", "ents_f"]
score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
limit: int = 0
batch_size: Union[Sequence[int], int]
class nlp(BaseModel):
lang: str
vectors: Optional[str]
pipeline: Optional[Dict[str, PipelineComponent]]
class Config:
extra = "allow"
@app.command("train")
def train_cli(
# fmt: off
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
config_path: ("Path to config file", "positional", None, Path),
output_path: ("Output directory to store model in", "option", "o", Path) = None,
code_path: ("Path to Python file with additional code (registered functions) to be imported", "option", "c", Path) = None,
init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
use_gpu: ("Use GPU", "option", "g", int) = -1,
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
# fmt: on
):
"""
@ -194,14 +162,14 @@ def train_cli(
def train(
config_path,
data_paths,
raw_text=None,
output_path=None,
tag_map=None,
weights_data=None,
omit_extra_lookups=False,
):
config_path: Path,
data_paths: Dict[str, Path],
raw_text: Optional[Path] = None,
output_path: Optional[Path] = None,
tag_map: Optional[Path] = None,
weights_data: Optional[bytes] = None,
omit_extra_lookups: bool = False,
) -> None:
msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False)

spacy/cli/validate.py

@ -1,18 +1,25 @@
from typing import Tuple
from pathlib import Path
import sys
import requests
from wasabi import msg
from wasabi import msg, Printer
from ._app import app
from .. import about
from ..util import get_package_version, get_installed_models, get_base_version
from ..util import get_package_path, get_model_meta, is_compatible_version
def validate():
@app.command("validate")
def validate_cli():
"""
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
validate()
def validate() -> None:
model_pkgs, compat = get_model_pkgs()
spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {})
@ -55,7 +62,8 @@ def validate():
sys.exit(1)
def get_model_pkgs():
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
msg = Printer(no_print=silent, pretty=not silent)
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@ -93,7 +101,7 @@ def get_model_pkgs():
return pkgs, compat
def reformat_version(version):
def reformat_version(version: str) -> str:
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith("-alpha"):
return version.replace("-alpha", "a0")

spacy/schemas.py

@ -1,8 +1,9 @@
from typing import Dict, List, Union, Optional
from typing import Dict, List, Union, Optional, Sequence, Any
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath
from collections import defaultdict
from thinc.api import Model
from .attrs import NAMES
@ -163,24 +164,48 @@ class ModelMetaSchema(BaseModel):
email: Optional[StrictStr] = Field(None, title="Model author email")
url: Optional[StrictStr] = Field(None, title="Model author URL")
sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors")
vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors")
accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
# fmt: on
# Training data object in "simple training style"
# JSON training format
class SimpleTrainingSchema(BaseModel):
# TODO: write
class PipelineComponent(BaseModel):
factory: str
model: Model
class Config:
title = "Schema for training data dict in passed to nlp.update"
extra = "forbid"
arbitrary_types_allowed = True
# JSON training format
class ConfigSchema(BaseModel):
optimizer: Optional["Optimizer"]
class training(BaseModel):
patience: int = 10
eval_frequency: int = 100
dropout: float = 0.2
init_tok2vec: Optional[FilePath] = None
max_epochs: int = 100
orth_variant_level: float = 0.0
gold_preproc: bool = False
max_length: int = 0
use_gpu: int = 0
scores: List[str] = ["ents_p", "ents_r", "ents_f"]
score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
limit: int = 0
batch_size: Union[Sequence[int], int]
class nlp(BaseModel):
lang: str
vectors: Optional[str]
pipeline: Optional[Dict[str, PipelineComponent]]
class Config:
extra = "allow"
class TrainingSchema(BaseModel):
@ -189,3 +214,34 @@ class TrainingSchema(BaseModel):
class Config:
title = "Schema for training data in spaCy's JSON format"
extra = "forbid"
# Project config Schema
class ProjectConfigAsset(BaseModel):
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: StrictStr = Field(..., title="URL of asset")
class ProjectConfigCommand(BaseModel):
# fmt: off
name: StrictStr = Field(..., title="Name of command")
help: Optional[StrictStr] = Field(None, title="Command description")
script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
# fmt: on
class ProjectConfigSchema(BaseModel):
# fmt: off
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
# fmt: on
class Config:
title = "Schema for project configuration file"

spacy/util.py

@ -1,10 +1,10 @@
from typing import List, Union
import os
import importlib
import importlib.util
import re
from pathlib import Path
import random
from typing import List
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
import functools
@ -17,6 +17,8 @@ import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
import subprocess
from contextlib import contextmanager
try:
@ -429,6 +431,30 @@ def get_package_path(name):
return Path(pkg.__file__).parent
def run_command(command: List[str]) -> None:
"""Run a command on the command line as a subprocess.
command (list): The split command.
"""
status = subprocess.call(command, env=os.environ.copy())
if status != 0:
sys.exit(status)
@contextmanager
def working_dir(path: Union[str, Path]) -> None:
"""Change current working directory and returns to previous on exit.
path (str / Path): The directory to navigate to.
"""
prev_cwd = Path.cwd()
os.chdir(str(path))
try:
yield
finally:
os.chdir(prev_cwd)
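
Together, these two helpers replace the ad-hoc subprocess handling removed from download.py and power the automatic sdist build added to package.py. A sketch with a hypothetical directory:

import sys
from spacy.util import run_command, working_dir

with working_dir("/tmp/en_my_model-0.0.1"):  # assumed package directory
    run_command([sys.executable, "setup.py", "sdist"])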
def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.