Merge branch 'whatif/arrow' of https://github.com/explosion/spaCy into whatif/arrow

Commit 8f420f3978
Author: Matthew Honnibal
Date: 2020-06-22 17:49:16 +02:00
27 changed files with 674 additions and 328 deletions


@@ -5,13 +5,13 @@ thinc==8.0.0a9
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
wasabi>=0.7.0,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.2.1,<1.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.3.0,<2.0.0
# Official Python utilities


@@ -44,14 +44,13 @@ install_requires =
preshed>=3.0.2,<3.1.0
thinc==8.0.0a9
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
wasabi>=0.7.0,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
ml_datasets>=0.1.1
typer>=0.2.1,<1.0.0
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
# Official Python utilities


@@ -8,7 +8,7 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
from thinc.api import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info
from .cli.info import info
from .glossary import explain
from .about import __version__
from .errors import Errors, Warnings
@@ -34,7 +34,3 @@ def load(name, **overrides):
def blank(name, **kwargs):
LangClass = util.get_lang_class(name)
return LangClass(**kwargs)
def info(model=None, markdown=False, silent=False):
return cli_info(model, markdown, silent)


@@ -1,31 +1,4 @@
if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from spacy.cli import download, link, info, package, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
from spacy.cli import train_cli
from spacy.cli import setup_cli
commands = {
"download": download,
"link": link,
"info": info,
"train": train_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,
"convert": convert,
"package": package,
"init-model": init_model,
"profile": profile,
"validate": validate,
}
if len(sys.argv) == 1:
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = f"spacy {command}"
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
available = f"Available: {', '.join(commands)}"
msg.fail(f"Unknown command: {command}", available, exits=1)
setup_cli()


@@ -5,3 +5,4 @@ __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
__projects__ = "https://github.com/explosion/spacy-boilerplates"


@@ -1,5 +1,9 @@
from wasabi import msg
from ._app import app, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
@@ -13,7 +17,10 @@ from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
def link(*args, **kwargs):
"""As of spaCy v3.0, model symlinks are deprecated. You can load models
using their full names or from a directory path."""
msg.warn(
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
"using their full names or from a directory path."

spacy/cli/_app.py (new file, 31 lines)

@@ -0,0 +1,31 @@
from typing import Optional
import typer
from typer.main import get_command
COMMAND = "python -m spacy"
NAME = "spacy"
HELP = """spaCy Command-line Interface
DOCS: https://spacy.io/api/cli
"""
app = typer.Typer(name=NAME, help=HELP)
def Arg(*args, help: Optional[str] = None, **kwargs) -> typer.Argument:
"""Wrapper for Typer's annotation to keep it short and set defaults."""
# Filter out help for now until it's officially supported
return typer.Argument(*args, **kwargs)
def Opt(*args, **kwargs) -> typer.Option:
"""Wrapper for Typer's annotation to keep it short and set defaults."""
return typer.Option(*args, show_default=True, **kwargs)
def setup_cli() -> None:
# Ensure that the help messages always display the correct prompt
command = get_command(app)
command(prog_name=COMMAND)
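Every CLI module below registers its command on this shared `app`. A minimal sketch of how a command would be declared with these helpers (the `hello` command is hypothetical, purely for illustration):

    from typing import Optional
    from pathlib import Path
    from spacy.cli._app import app, Arg, Opt

    @app.command("hello")
    def hello_cli(
        # fmt: off
        name: str = Arg(..., help="Name to greet"),
        out_dir: Optional[Path] = Opt(None, "--out-dir", "-o", help="Optional output directory"),
        # fmt: on
    ):
        """Hypothetical example command showing the Arg/Opt wrappers in use."""
        print(f"Hello, {name}! Output dir: {out_dir}")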


@@ -1,9 +1,12 @@
from typing import Optional
from enum import Enum
from pathlib import Path
from wasabi import Printer
import srsly
import re
import sys
from ._app import app, Arg, Opt
from ..gold import docs_to_json
from ..tokens import DocBin
from ..gold.converters import iob2docs, conll_ner2docs, json2docs
@@ -23,37 +26,82 @@ CONVERTERS = {
}
# File types
FILE_TYPES = ("json", "jsonl", "msg")
FILE_TYPES_STDOUT = ("json", "jsonl")
# File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",)
def convert(
class FileTypes(str, Enum):
json = "json"
spacy = "spacy"
@app.command("convert")
def convert_cli(
# fmt: off
input_path: ("Input file or directory", "positional", None, Path),
output_dir: ("Output directory.", "positional", None, Path),
file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy",
n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
lang: ("Language (if tokenizer required)", "option", "l", str) = None,
input_path: str = Arg(..., help="Input file or directory", exists=True),
output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
# fmt: on
):
"""
Convert files into json or DocBin format for use with train command and other
experiment management functions.
experiment management functions. If no output_dir is specified, the data
is written to stdout, so you can pipe it forward to a JSON file:
$ spacy convert some_file.conllu > some_file.json
"""
if isinstance(file_type, FileTypes):
# We get an instance of the FileTypes from the CLI so we need its string value
file_type = file_type.value
cli_args = locals()
no_print = output_dir == "-"
silent = output_dir == "-"
output_dir = Path(output_dir) if output_dir != "-" else "-"
msg = Printer(no_print=no_print)
msg = Printer(no_print=silent)
verify_cli_args(msg, **cli_args)
converter = _get_converter(msg, converter, input_path)
convert(
input_path,
output_dir,
file_type=file_type,
n_sents=n_sents,
seg_sents=seg_sents,
model=model,
morphology=morphology,
merge_subtokens=merge_subtokens,
converter=converter,
ner_map=ner_map,
lang=lang,
silent=silent,
msg=msg,
)
def convert(
input_path: Path,
output_dir: Path,
*,
file_type: str = "json",
n_sents: int = 1,
seg_sents: bool = False,
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
silent: bool = True,
msg: Optional[Printer] = None,
) -> None:
if not msg:
msg = Printer(no_print=silent)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
for input_loc in walk_directory(input_path):
input_data = input_loc.open("r", encoding="utf-8").read()
# Use converter function to convert data
@@ -66,25 +114,30 @@ def convert(
merge_subtokens=merge_subtokens,
lang=lang,
model=model,
no_print=no_print,
no_print=silent,
ner_map=ner_map,
)
if output_dir != "-":
# Export data to a file
suffix = f".{file_type}"
subpath = input_loc.relative_to(input_path)
output_file = (output_dir / subpath).with_suffix(suffix)
output_file = Path(output_dir) / subpath.with_suffix(suffix)
if not output_file.parent.exists():
output_file.parent.mkdir(parents=True)
if file_type == "json":
data = docs2json(docs)
srsly.write_json(output_file, docs2json(docs))
srsly.write_json(output_file, docs_to_json(docs))
else:
data = DocBin(docs=docs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
else:
# Print to stdout
if file_type == "json":
srsly.write_json("-", docs_to_json(docs))
def autodetect_ner_format(input_data):
def autodetect_ner_format(input_data: str) -> str:
# guess format from the first 20 lines
lines = input_data.split("\n")[:20]
format_guesses = {"ner": 0, "iob": 0}
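With the Typer wrapper (`convert_cli`) split from the plain `convert` function, conversion can be driven from Python directly. A minimal sketch, assuming hypothetical input and output paths:

    from pathlib import Path
    from spacy.cli.convert import convert

    # silent=True (the default) suppresses all Printer output
    convert(
        Path("corpus/train.conllu"),  # hypothetical input file
        Path("corpus/converted"),     # hypothetical output directory
        file_type="spacy",
        converter="auto",
        silent=True,
    )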


@@ -1,11 +1,14 @@
from typing import Optional, List, Sequence, Dict, Any, Tuple
from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES
from ..gold import Corpus
from ._app import app, Arg, Opt
from ..gold import Corpus, Example
from ..syntax import nonproj
from ..language import Language
from ..util import load_model, get_lang_class
@@ -18,17 +21,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
def debug_data(
@app.command("debug-data")
def debug_data_cli(
# fmt: off
lang: ("Model language", "positional", None, str),
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
lang: str = Arg(..., help="Model language"),
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
# fmt: on
):
"""
@@ -36,8 +40,36 @@ def debug_data(
stats, and find problems like invalid entity annotations, cyclic
dependencies, low data labels and more.
"""
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
debug_data(
lang,
train_path,
dev_path,
tag_map_path=tag_map_path,
base_model=base_model,
pipeline=[p.strip() for p in pipeline.split(",")],
ignore_warnings=ignore_warnings,
verbose=verbose,
no_format=no_format,
silent=False,
)
def debug_data(
lang: str,
train_path: Path,
dev_path: Path,
*,
tag_map_path: Optional[Path] = None,
base_model: Optional[str] = None,
pipeline: List[str] = ["tagger", "parser", "ner"],
ignore_warnings: bool = False,
verbose: bool = False,
no_format: bool = True,
silent: bool = True,
):
msg = Printer(
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
)
# Make sure all files and paths exists if they are needed
if not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
@@ -49,7 +81,6 @@ def debug_data(
tag_map = srsly.read_json(tag_map_path)
# Initialize the model and pipeline
pipeline = [p.strip() for p in pipeline.split(",")]
if base_model:
nlp = load_model(base_model)
else:
@@ -446,7 +477,7 @@ def debug_data(
sys.exit(1)
def _load_file(file_path, msg):
def _load_file(file_path: Path, msg: Printer) -> None:
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
with msg.loading(f"Loading {file_name}..."):
@@ -465,7 +496,9 @@ def _load_file(file_path, msg):
)
def _compile_gold(examples, pipeline, nlp):
def _compile_gold(
examples: Sequence[Example], pipeline: List[str], nlp: Language
) -> Dict[str, Any]:
data = {
"ner": Counter(),
"cats": Counter(),
@@ -537,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp):
return data
def _format_labels(labels, counts=False):
def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
if counts:
return ", ".join([f"'{l}' ({c})" for l, c in labels])
return ", ".join([f"'{l}'" for l in labels])
def _get_examples_without_label(data, label):
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
count = 0
for eg in data:
labels = [
@@ -556,7 +589,7 @@ def _get_examples_without_label(data, label):
return count
def _get_labels_from_model(nlp, pipe_name):
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
if pipe_name not in nlp.pipe_names:
return set()
pipe = nlp.get_pipe(pipe_name)
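The same CLI/function split applies here; `debug_data` can be called programmatically, with `silent` and `no_format` controlling the Printer. A sketch with hypothetical paths:

    from pathlib import Path
    from spacy.cli.debug_data import debug_data

    debug_data(
        "en",
        Path("train.json"),  # hypothetical training data
        Path("dev.json"),    # hypothetical development data
        pipeline=["ner"],
        no_format=False,     # pretty-print the report
        silent=False,        # actually print it
    )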


@@ -1,23 +1,36 @@
from typing import Optional, Sequence, Union
import requests
import os
import subprocess
import sys
from wasabi import msg
import typer
from ._app import app, Arg, Opt
from .. import about
from ..util import is_package, get_base_version
from ..util import is_package, get_base_version, run_command
def download(
model: ("Model to download (shortcut or name)", "positional", None, str),
direct: ("Force direct download of name + version", "flag", "d", bool) = False,
*pip_args: ("Additional arguments to be passed to `pip install` on model install"),
@app.command(
"download",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def download_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model to download (shortcut or name)"),
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
# fmt: on
):
"""
Download compatible model from default download path using pip. If --direct
flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped.
For direct downloads, the compatibility check will be skipped. All
additional arguments provided to this command will be passed to `pip install`
on model installation.
"""
download(model, direct, *ctx.args)
def download(model: str, direct: bool = False, *pip_args) -> None:
if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
@@ -33,22 +46,20 @@ def download(
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
if dl != 0: # if download subprocess doesn't return 0, exit
sys.exit(dl)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
msg.good(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
def get_json(url, desc):
def get_json(url: str, desc: str) -> Union[dict, list]:
r = requests.get(url)
if r.status_code != 200:
msg.fail(
@@ -62,7 +73,7 @@ def get_json(url, desc):
return r.json()
def get_compatibility():
def get_compatibility() -> dict:
version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"]
@@ -71,7 +82,7 @@ def get_compatibility():
return comp[version]
def get_version(model, comp):
def get_version(model: str, comp: dict) -> str:
model = get_base_version(model)
if model not in comp:
msg.fail(
@@ -81,10 +92,12 @@ def get_version(model, comp):
return comp[model][0]
def download_model(filename, user_pip_args=None):
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:
download_url = about.__download_url__ + "/" + filename
pip_args = ["--no-cache-dir"]
if user_pip_args:
pip_args.extend(user_pip_args)
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
return subprocess.call(cmd, env=os.environ.copy())
run_command(cmd)
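Because `download_cli` forwards `ctx.args`, anything after the model name on the command line reaches `pip install`; the plain function mirrors this through `*pip_args`. A sketch (model name and flag are illustrative):

    from spacy.cli.download import download

    # equivalent to: python -m spacy download en_core_web_sm --no-deps
    download("en_core_web_sm", False, "--no-deps")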


@@ -1,26 +1,56 @@
from typing import Optional, List
from timeit import default_timer as timer
from wasabi import msg
from wasabi import Printer
from pathlib import Path
from ..gold import Corpus
from ..tokens import Doc
from ._app import app, Arg, Opt
from ..scorer import Scorer
from .. import util
from .. import displacy
def evaluate(
@app.command("evaluate")
def evaluate_cli(
# fmt: off
model: ("Model name or path", "positional", None, str),
data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
gpu_id: ("Use GPU", "option", "g", int) = -1,
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
# fmt: on
):
"""
Evaluate a model. To render a sample of parses in an HTML file, set an
output directory as the displacy_path argument.
"""
evaluate(
model,
data_path,
gpu_id=gpu_id,
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
silent=False,
return_scores=return_scores,
)
def evaluate(
model: str,
data_path: Path,
gpu_id: int = -1,
gold_preproc: bool = False,
displacy_path: Optional[Path] = None,
displacy_limit: int = 25,
silent: bool = True,
return_scores: bool = False,
) -> Scorer:
msg = Printer(no_print=silent, pretty=not silent)
util.fix_random_seed()
if gpu_id >= 0:
util.use_gpu(gpu_id)
@@ -79,7 +109,14 @@ def evaluate(
return scorer.scores
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
def render_parses(
docs: List[Doc],
output_path: Path,
model_name: str = "",
limit: int = 250,
deps: bool = True,
ents: bool = True,
):
docs[0].user_data["title"] = model_name
if ents:
html = displacy.render(docs[:limit], style="ent", page=True)
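`evaluate` now hands its results back instead of only printing them, so it can be used from Python. A sketch, assuming a hypothetical model name and data path:

    from pathlib import Path
    from spacy.cli.evaluate import evaluate

    scores = evaluate(
        "en_core_web_sm",  # hypothetical model name
        Path("dev.json"),  # hypothetical evaluation data
        silent=True,       # no console output, just the return value
        return_scores=True,
    )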


@@ -1,24 +1,80 @@
from typing import Optional, Dict, Any, Union
import platform
from pathlib import Path
from wasabi import msg
from wasabi import Printer
import srsly
from .validate import get_model_pkgs
from ._app import app, Arg, Opt
from .. import util
from .. import about
def info(
model: ("Optional model name", "positional", None, str) = None,
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
silent: ("Don't print anything (just return)", "flag", "s") = False,
@app.command("info")
def info_cli(
# fmt: off
model: Optional[str] = Arg(None, help="Optional model name"),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
# fmt: on
):
"""
Print info about spaCy installation. If a model is specified as an argument,
print model information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
"""
info(model, markdown=markdown, silent=silent)
def info(
model: Optional[str] = None, *, markdown: bool = False, silent: bool = True
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if model:
title = f"Info about model '{model}'"
data = info_model(model, silent=silent)
else:
title = "Info about spaCy"
data = info_spacy()
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
if "Models" in data and isinstance(data["Models"], dict):
data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
markdown_data = get_markdown(data, title=title)
if markdown:
if not silent:
print(markdown_data)
return markdown_data
if not silent:
table_data = dict(data)
msg.table(table_data, title=title)
return raw_data
def info_spacy() -> Dict[str, Any]:
"""Generate info about the current spaCy installation.
RETURNS (dict): The spaCy info.
"""
all_models = {}
for pkg_name in util.get_installed_models():
package = pkg_name.replace("-", "_")
all_models[package] = util.get_package_version(pkg_name)
return {
"spaCy version": about.__version__,
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": all_models,
}
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
"""Generate info about a specific model.
model (str): Model name or path.
silent (bool): Don't print anything, just return.
RETURNS (dict): The model meta.
"""
msg = Printer(no_print=silent, pretty=not silent)
if util.is_package(model):
model_path = util.get_package_path(model)
else:
@@ -32,46 +88,22 @@ def info(
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
if not silent:
title = f"Info about model '{model}'"
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
if markdown:
print_markdown(model_meta, title=title)
else:
msg.table(model_meta, title=title)
return meta
all_models, _ = get_model_pkgs()
data = {
"spaCy version": about.__version__,
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": ", ".join(
f"{m['name']} ({m['version']})" for m in all_models.values()
),
}
if not silent:
title = "Info about spaCy"
if markdown:
print_markdown(data, title=title)
else:
msg.table(data, title=title)
return data
return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
"""Get data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
title (str / None): Title, will be rendered as headline 2.
RETURNS (str): The Markdown string.
"""
markdown = []
for key, value in data.items():
if isinstance(value, str) and Path(value).exists():
continue
markdown.append(f"* **{key}:** {value}")
result = "\n{}\n".format("\n".join(markdown))
if title:
print(f"\n## {title}")
print("\n{}\n".format("\n".join(markdown)))
result = f"\n## {title}\n{result}"
return result
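`get_markdown` returns the string instead of printing it, which is what lets `info` pass Markdown back to callers. A small sketch of the output shape (example values only):

    from spacy.cli.info import get_markdown

    md = get_markdown(
        {"spaCy version": "3.0.0a0", "Platform": "Linux"},
        title="Info about spaCy",
    )
    print(md)
    # prints roughly:
    # ## Info about spaCy
    # * **spaCy version:** 3.0.0a0
    # * **Platform:** Linux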


@@ -1,3 +1,4 @@
from typing import Optional, List, Dict, Any, Union, IO
import math
from tqdm import tqdm
import numpy
@@ -9,10 +10,12 @@ import gzip
import zipfile
import srsly
import warnings
from wasabi import msg
from wasabi import Printer
from ._app import app, Arg, Opt
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..language import Language
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
from ..lookups import Lookups
@@ -25,20 +28,21 @@ except ImportError:
DEFAULT_OOV_PROB = -20
def init_model(
@app.command("init-model")
def init_model_cli(
# fmt: off
lang: ("Model language", "positional", None, str),
output_dir: ("Model output directory", "positional", None, Path),
freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
truncate_vectors: ("Optional number of vectors to truncate to when reading in vectors file", "option", "t", int) = 0,
vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
model_name: ("Optional name for the model meta", "option", "mn", str) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
base_model: ("Base model (for languages with custom tokenizers)", "option", "b", str) = None
lang: str = Arg(..., help="Model language"),
output_dir: Path = Arg(..., help="Model output directory"),
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
# fmt: on
):
"""
@@ -46,6 +50,38 @@ def init_model(
and word vectors. If vectors are provided in Word2Vec format, they can
be either a .txt or zipped as a .zip or .tar.gz.
"""
init_model(
lang,
output_dir,
freqs_loc=freqs_loc,
clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc,
prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors,
vectors_name=vectors_name,
model_name=model_name,
omit_extra_lookups=omit_extra_lookups,
base_model=base_model,
silent=False,
)
def init_model(
lang: str,
output_dir: Path,
freqs_loc: Optional[Path] = None,
clusters_loc: Optional[Path] = None,
jsonl_loc: Optional[Path] = None,
vectors_loc: Optional[Path] = None,
prune_vectors: int = -1,
truncate_vectors: int = 0,
vectors_name: Optional[str] = None,
model_name: Optional[str] = None,
omit_extra_lookups: bool = False,
base_model: Optional[str] = None,
silent: bool = True,
) -> Language:
msg = Printer(no_print=silent, pretty=not silent)
if jsonl_loc is not None:
if freqs_loc is not None or clusters_loc is not None:
settings = ["-j"]
@@ -68,7 +104,7 @@ def init_model(
freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists():
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
@@ -83,7 +119,9 @@ def init_model(
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
add_vectors(
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
@@ -95,7 +133,7 @@ def init_model(
return nlp
def open_file(loc):
def open_file(loc: Union[str, Path]) -> IO:
"""Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc)
if tarfile.is_tarfile(str(loc)):
@@ -111,7 +149,9 @@ def open_file(loc):
return loc.open("r", encoding="utf8")
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
def read_attrs_from_deprecated(
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
) -> List[Dict[str, Any]]:
if freqs_loc is not None:
with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc)
@@ -139,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
return lex_attrs
def create_model(lang, lex_attrs, name=None, base_model=None):
def create_model(
lang: str,
lex_attrs: List[Dict[str, Any]],
name: Optional[str] = None,
base_model: Optional[Union[str, Path]] = None,
) -> Language:
if base_model:
nlp = load_model(base_model)
# keep the tokenizer but remove any existing pipeline components due to
@@ -166,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
return nlp
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
def add_vectors(
msg: Printer,
nlp: Language,
vectors_loc: Optional[Path],
truncate_vectors: int,
prune_vectors: int,
name: Optional[str] = None,
) -> None:
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -176,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
else:
if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(vectors_loc)
vectors_data, vector_keys = read_vectors(msg, vectors_loc)
msg.good(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
@@ -195,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(vectors_loc, truncate_vectors=0):
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1:
@@ -215,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
return vectors_data, vectors_keys
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
def read_freqs(
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
):
counts = PreshCounter()
total = 0
with freqs_loc.open() as f:
@@ -244,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
return probs, oov_prob
def read_clusters(clusters_loc):
def read_clusters(clusters_loc: Path) -> dict:
clusters = {}
if ftfy is None:
warnings.warn(Warnings.W004)
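The module-level `msg` is gone from this file; helpers such as `read_vectors` and `add_vectors` receive a `Printer` explicitly, so callers control verbosity. A sketch with a hypothetical vectors file:

    from pathlib import Path
    from wasabi import Printer
    from spacy.cli.init_model import read_vectors

    msg = Printer(no_print=True)  # fully silent
    vectors_data, vector_keys = read_vectors(msg, Path("vectors.txt"))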


@@ -1,19 +1,24 @@
from typing import Optional, Union, Any, Dict
import shutil
from pathlib import Path
from wasabi import msg, get_raw_input
from wasabi import Printer, get_raw_input
import srsly
import sys
from ._app import app, Arg, Opt
from ..schemas import validate, ModelMetaSchema
from .. import util
from .. import about
def package(
@app.command("package")
def package_cli(
# fmt: off
input_dir: ("Directory with model data", "positional", None, str),
output_dir: ("Output parent directory", "positional", None, str),
meta_path: ("Path to meta.json", "option", "m", str) = None,
create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
# fmt: on
):
"""
@@ -23,6 +28,25 @@ def package(
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
"""
package(
input_dir,
output_dir,
meta_path=meta_path,
create_meta=create_meta,
force=force,
silent=False,
)
def package(
input_dir: Path,
output_dir: Path,
meta_path: Optional[Path] = None,
create_meta: bool = False,
force: bool = False,
silent: bool = True,
) -> None:
msg = Printer(no_print=silent, pretty=not silent)
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
@@ -33,23 +57,20 @@ def package(
if meta_path and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta_path = meta_path or input_dir / "meta.json"
if not meta_path.exists() or not meta_path.is_file():
msg.fail("Can't load model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite
msg.good("Loaded meta.json from file", meta_path)
else:
meta = generate_meta(input_dir, meta, msg)
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(
f"No '{key}' setting found in meta.json",
"This setting is required to build your package.",
exits=1,
)
errors = validate(ModelMetaSchema, meta)
if errors:
msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
model_name = meta["lang"] + "_" + meta["name"]
model_name_v = model_name + "-" + meta["version"]
main_path = output_path / model_name_v
main_path = output_dir / model_name_v
package_path = main_path / model_name
if package_path.exists():
@@ -63,21 +84,26 @@
exits=1,
)
Path.mkdir(package_path, parents=True)
shutil.copytree(str(input_path), str(package_path / model_name_v))
shutil.copytree(str(input_dir), str(package_path / model_name_v))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.")
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"])
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
msg.good("Successfully created zipped Python package", zip_file)
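`package` now also builds the sdist itself via `working_dir` and `run_command` (see spacy/util.py below), so one call goes from model directory to installable archive. A sketch with hypothetical directories:

    from pathlib import Path
    from spacy.cli.package import package

    package(
        Path("models/my_model"),  # hypothetical model directory
        Path("packages"),         # hypothetical output parent directory
        force=True,
        silent=False,
    )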
def create_file(file_path, contents):
def create_file(file_path: Path, contents: str) -> None:
file_path.touch()
file_path.open("w", encoding="utf-8").write(contents)
def generate_meta(model_path, existing_meta, msg):
def generate_meta(
model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
) -> Dict[str, Any]:
meta = existing_meta or {}
settings = [
("lang", "Model language", meta.get("lang", "en")),


@@ -1,14 +1,15 @@
from typing import Optional
import random
import numpy
import time
import re
from collections import Counter
import plac
from pathlib import Path
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
from wasabi import msg
import srsly
from ._app import app, Arg, Opt
from ..errors import Errors
from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc
@@ -17,25 +18,17 @@ from .. import util
from ..gold import Example
@plac.annotations(
@app.command("pretrain")
def pretrain_cli(
# fmt: off
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
use_gpu=("Use GPU", "option", "g", int),
resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"),
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
# fmt: on
)
def pretrain(
texts_loc,
vectors_model,
config_path,
output_dir,
use_gpu=-1,
resume_path=None,
epoch_resume=None,
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@@ -52,6 +45,26 @@ def pretrain(
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
"""
pretrain(
texts_loc,
vectors_model,
output_dir,
config_path,
use_gpu=use_gpu,
resume_path=resume_path,
epoch_resume=epoch_resume,
)
def pretrain(
texts_loc: Path,
vectors_model: str,
output_dir: Path,
config_path: Path,
use_gpu: int = -1,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)


@@ -1,3 +1,4 @@
from typing import Optional, Sequence, Union, Iterator
import tqdm
from pathlib import Path
import srsly
@@ -5,17 +6,19 @@ import cProfile
import pstats
import sys
import itertools
import ml_datasets
from wasabi import msg
from wasabi import msg, Printer
from ._app import app, Arg, Opt
from ..language import Language
from ..util import load_model
def profile(
@app.command("profile")
def profile_cli(
# fmt: off
model: ("Model to load", "positional", None, str),
inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
model: str = Arg(..., help="Model to load"),
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
# fmt: on
):
"""
@@ -24,6 +27,18 @@ def profile(
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via ml_datasets.
"""
profile(model, inputs=inputs, n_texts=n_texts)
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
try:
import ml_datasets
except ImportError:
msg.fail(
"This command requires the ml_datasets library to be installed: "
"pip install ml_datasets",
exits=1,
)
if inputs is not None:
inputs = _read_inputs(inputs, msg)
if inputs is None:
@@ -43,12 +58,12 @@ def profile(
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass
def _read_inputs(loc, msg):
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
if loc == "-":
msg.info("Reading input from sys.stdin")
file_ = sys.stdin


@@ -1,10 +1,9 @@
from typing import Optional, Dict, List, Union, Sequence
from timeit import default_timer as timer
import plac
import srsly
from pydantic import BaseModel, FilePath
import tqdm
from pydantic import BaseModel, FilePath
from pathlib import Path
from wasabi import msg
import thinc
@@ -12,6 +11,7 @@ import thinc.schedules
from thinc.api import Model, use_pytorch_for_gpu_memory
import random
from ._app import app, Arg, Opt
from ..gold import Corpus
from ..lookups import Lookups
from .. import util
@@ -20,6 +20,9 @@ from ..errors import Errors
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
# from ..schemas import ConfigSchema # TODO: include?
registry = util.registry
CONFIG_STR = """
@@ -116,35 +119,21 @@ class ConfigSchema(BaseModel):
extra = "allow"
@plac.annotations(
# fmt: off
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
output_path=("Output directory to store model in", "option", "o", Path),
code_path=("Path to Python file with additional code (registered functions) to be imported", "option", "c", Path),
init_tok2vec=(
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
use_gpu=("Use GPU", "option", "g", int),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
# fmt: on
)
@app.command("train")
def train_cli(
train_path,
dev_path,
config_path,
output_path=None,
code_path=None,
init_tok2vec=None,
raw_text=None,
verbose=False,
use_gpu=-1,
tag_map_path=None,
omit_extra_lookups=False,
# fmt: off
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
# fmt: on
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
@@ -183,14 +172,14 @@ def train_cli(
def train(
config_path,
data_paths,
raw_text=None,
output_path=None,
tag_map=None,
weights_data=None,
omit_extra_lookups=False,
):
config_path: Path,
data_paths: Dict[str, Path],
raw_text: Optional[Path] = None,
output_path: Optional[Path] = None,
tag_map: Optional[Path] = None,
weights_data: Optional[bytes] = None,
omit_extra_lookups: bool = False,
) -> None:
msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False)
@@ -591,8 +580,6 @@ def verify_cli_args(
def verify_textcat_config(nlp, nlp_config):
msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels")
nlp.get_pipe("textcat").labels = tuple(textcat_labels)
# if 'positive_label' is provided: double check whether it's in the data and
# the task is binary
if nlp_config["pipeline"]["textcat"].get("positive_label", None):


@@ -1,18 +1,25 @@
from typing import Tuple
from pathlib import Path
import sys
import requests
from wasabi import msg
from wasabi import msg, Printer
from ._app import app
from .. import about
from ..util import get_package_version, get_installed_models, get_base_version
from ..util import get_package_path, get_model_meta, is_compatible_version
def validate():
@app.command("validate")
def validate_cli():
"""
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
validate()
def validate() -> None:
model_pkgs, compat = get_model_pkgs()
spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {})
@@ -55,7 +62,8 @@ def validate():
sys.exit(1)
def get_model_pkgs():
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
msg = Printer(no_print=silent, pretty=not silent)
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@@ -93,7 +101,7 @@ def get_model_pkgs():
return pkgs, compat
def reformat_version(version):
def reformat_version(version: str) -> str:
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith("-alpha"):
return version.replace("-alpha", "a0")


@@ -1,9 +1,9 @@
from wasabi import Printer
from .. import tags_to_entities
from ...gold import iob_to_biluo
from ...lang.xx import MultiLanguage
from ...tokens.doc import Doc
from ...vocab import Vocab
from ...tokens import Doc, Span
from ...util import load_model
@@ -98,7 +98,7 @@ def conll_ner2docs(
biluo_tags = []
for conll_sent in conll_doc.split("\n\n"):
conll_sent = conll_sent.strip()
if not sent:
if not conll_sent:
continue
lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
cols = list(zip(*[line.split() for line in lines]))
@@ -110,7 +110,7 @@
)
length = len(cols[0])
words.extend(cols[0])
sent_stats.extend([True] + [False] * (length - 1))
sent_starts.extend([True] + [False] * (length - 1))
biluo_tags.extend(iob_to_biluo(cols[-1]))
pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)


@@ -1,10 +1,10 @@
import re
from .conll_ner2docs import n_sents_info
from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags
from ...language import Language
from ...tokens import Doc, Token
from .conll_ner2json import n_sents_info
from wasabi import Printer


@@ -1,12 +1,12 @@
from wasabi import Printer
from ...gold import iob_to_biluo, tags_to_entities
from ...util import minibatch
from ...tokens import Doc, Span
from .util import merge_sentences
from .conll_ner2docs import n_sents_info
def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
"""
Convert IOB files with one sentence per line and tags separated with '|'
into Doc objects so they can be saved. IOB and IOB2 are accepted.
@@ -19,14 +19,14 @@ def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
"""
msg = Printer(no_print=no_print)
docs = read_iob(input_data.split("\n"))
docs = read_iob(input_data.split("\n"), vocab)
if n_sents > 0:
n_sents_info(msg, n_sents)
docs = merge_sentences(docs, n_sents)
return docs
def read_iob(raw_sents):
def read_iob(raw_sents, vocab):
docs = []
for line in raw_sents:
if not line.strip():
@@ -42,10 +42,10 @@ def read_iob(raw_sents):
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
)
doc = Doc(vocab, words=words)
for i, tag in enumerate(pos):
for i, tag in enumerate(tags):
doc[i].tag_ = tag
biluo = iob_to_biluo(iob)
entities = biluo_tags_to_entities(biluo)
entities = tags_to_entities(biluo)
doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
docs.append(doc)
return docs
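`iob2docs` and `read_iob` now require a `Vocab` to construct the `Doc` objects, matching the updated test further down. A sketch:

    from spacy.vocab import Vocab
    from spacy.gold.converters import iob2docs

    input_data = "I|O like|O London|B-GPE .|O"
    docs = iob2docs(input_data, Vocab(), n_sents=10)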

View File

@@ -1,3 +1,6 @@
from spacy.util import minibatch
def merge_sentences(docs, n_sents):
merged = []
for group in minibatch(docs, size=n_sents):


@@ -1,8 +1,9 @@
from typing import Dict, List, Union, Optional
from typing import Dict, List, Union, Optional, Sequence, Any
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath
from collections import defaultdict
from thinc.api import Model
from .attrs import NAMES
@@ -163,24 +164,48 @@ class ModelMetaSchema(BaseModel):
email: Optional[StrictStr] = Field(None, title="Model author email")
url: Optional[StrictStr] = Field(None, title="Model author URL")
sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors")
vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors")
accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
# fmt: on
# Training data object in "simple training style"
# JSON training format
class SimpleTrainingSchema(BaseModel):
# TODO: write
class PipelineComponent(BaseModel):
factory: str
model: Model
class Config:
title = "Schema for training data dict passed to nlp.update"
extra = "forbid"
arbitrary_types_allowed = True
# JSON training format
class ConfigSchema(BaseModel):
optimizer: Optional["Optimizer"]
class training(BaseModel):
patience: int = 10
eval_frequency: int = 100
dropout: float = 0.2
init_tok2vec: Optional[FilePath] = None
max_epochs: int = 100
orth_variant_level: float = 0.0
gold_preproc: bool = False
max_length: int = 0
use_gpu: int = 0
scores: List[str] = ["ents_p", "ents_r", "ents_f"]
score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
limit: int = 0
batch_size: Union[Sequence[int], int]
class nlp(BaseModel):
lang: str
vectors: Optional[str]
pipeline: Optional[Dict[str, PipelineComponent]]
class Config:
extra = "allow"
class TrainingSchema(BaseModel):
@@ -189,3 +214,34 @@ class TrainingSchema(BaseModel):
class Config:
title = "Schema for training data in spaCy's JSON format"
extra = "forbid"
# Project config Schema
class ProjectConfigAsset(BaseModel):
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: StrictStr = Field(..., title="URL of asset")
class ProjectConfigCommand(BaseModel):
# fmt: off
name: StrictStr = Field(..., title="Name of command")
help: Optional[StrictStr] = Field(None, title="Command description")
script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
# fmt: on
class ProjectConfigSchema(BaseModel):
# fmt: off
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
# fmt: on
class Config:
title = "Schema for project configuration file"
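The new project schemas plug into the same `validate` helper used with `ModelMetaSchema` in the package command above. A sketch validating a minimal project config (all values hypothetical):

    from spacy.schemas import ProjectConfigSchema, validate

    cfg = {
        "variables": {"lang": "en"},
        "assets": [{"dest": "assets/data.json", "url": "https://example.com/data.json"}],
        "run": ["download"],
        "commands": [{"name": "download", "script": ["python scripts/download.py"]}],
    }
    errors = validate(ProjectConfigSchema, cfg)  # empty list if well-formed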


@@ -6,7 +6,6 @@ from spacy.pipeline.defaults import default_parser
from spacy.pipeline import DependencyParser
from spacy.tokens import Doc
from spacy.syntax.nonproj import projectivize
from spacy.syntax.stateclass import StateClass
from spacy.syntax.arc_eager import ArcEager
@@ -41,26 +40,6 @@ def arc_eager(vocab):
return moves
@pytest.fixture
def words():
return ["a", "b"]
@pytest.fixture
def doc(words, vocab):
if vocab is None:
vocab = Vocab()
return Doc(vocab, words=list(words))
@pytest.fixture
def gold(doc, words):
if len(words) == 2:
return GoldParse(doc, words=["a", "b"], heads=[0, 0], deps=["ROOT", "right"])
else:
raise NotImplementedError
def test_oracle_four_words(arc_eager, vocab):
words = ["a", "b", "c", "d"]
heads = [1, 1, 3, 3]


@@ -31,5 +31,5 @@ def test_issue4665():
conllu2json should not raise an exception if the HEAD column contains an
underscore
"""
conllu2json(input_data)
pass
# conllu2json(input_data)


@@ -1,7 +1,9 @@
import pytest
from spacy.lang.en import English
from spacy.gold import docs_to_json
from spacy.gold.converters import iob2docs, conll_ner2docs
from spacy.gold.converters.conllu2json import conllu2json
from spacy.lang.en import English
from spacy.cli.pretrain import make_docs
# TODO
@@ -116,7 +118,7 @@ def test_cli_converters_conllu2json_subtokens():
@pytest.mark.xfail
def test_cli_converters_iob2json():
def test_cli_converters_iob2json(en_vocab):
lines = [
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
"I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -124,7 +126,8 @@ def test_cli_converters_iob2json():
"I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
]
input_data = "\n".join(lines)
converted = iob2json(input_data, n_sents=10)
converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
converted = docs_to_json(converted_docs)
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
@@ -190,7 +193,8 @@ def test_cli_converters_conll_ner2json():
".\t.\t_\tO",
]
input_data = "\n".join(lines)
converted = conll_ner2json(input_data, n_sents=10)
converted_docs = conll_ner2docs(input_data, n_sents=10)
converted = docs_to_json(converted_docs)
assert len(converted) == 1
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1


@@ -1,10 +1,10 @@
from typing import List, Union
import os
import importlib
import importlib.util
import re
from pathlib import Path
import random
from typing import List
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
import functools
@@ -17,6 +17,8 @@ import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
import subprocess
from contextlib import contextmanager
try:
@@ -429,6 +431,30 @@ def get_package_path(name):
return Path(pkg.__file__).parent
def run_command(command: List[str]) -> None:
"""Run a command on the command line as a subprocess.
command (list): The split command.
"""
status = subprocess.call(command, env=os.environ.copy())
if status != 0:
sys.exit(status)
@contextmanager
def working_dir(path: Union[str, Path]) -> None:
"""Change current working directory and return to the previous one on exit.
path (str / Path): The directory to navigate to.
"""
prev_cwd = Path.cwd()
os.chdir(str(path))
try:
yield
finally:
os.chdir(prev_cwd)
def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
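`run_command` and `working_dir` are the building blocks behind the new packaging step. Combined usage, mirroring what `package()` does (the directory is hypothetical):

    import sys
    from spacy.util import run_command, working_dir

    with working_dir("path/to/model_package"):
        run_command([sys.executable, "setup.py", "sdist"])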