mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-05 22:20:34 +03:00
Merge remote-tracking branch 'upstream/develop' into whatif/arrow
# Conflicts: # spacy/cli/__init__.py # spacy/cli/convert.py # spacy/cli/debug_data.py # spacy/cli/evaluate.py # spacy/cli/train.py
This commit is contained in:
commit
f7769eb808
|
@ -5,13 +5,13 @@ thinc==8.0.0a9
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets>=0.1.1
|
ml_datasets>=0.1.1
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.4.0,<1.1.0
|
wasabi>=0.7.0,<1.1.0
|
||||||
srsly>=2.0.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
catalogue>=0.0.7,<1.1.0
|
catalogue>=0.0.7,<1.1.0
|
||||||
|
typer>=0.2.1,<1.0.0
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
plac>=0.9.6,<1.2.0
|
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
|
|
|
@ -44,14 +44,13 @@ install_requires =
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc==8.0.0a9
|
thinc==8.0.0a9
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.4.0,<1.1.0
|
wasabi>=0.7.0,<1.1.0
|
||||||
srsly>=2.0.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
catalogue>=0.0.7,<1.1.0
|
catalogue>=0.0.7,<1.1.0
|
||||||
ml_datasets>=0.1.1
|
typer>=0.2.1,<1.0.0
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
plac>=0.9.6,<1.2.0
|
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
# Official Python utilities
|
# Official Python utilities
|
||||||
|
|
|
@ -8,7 +8,7 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
||||||
from thinc.api import prefer_gpu, require_gpu
|
from thinc.api import prefer_gpu, require_gpu
|
||||||
|
|
||||||
from . import pipeline
|
from . import pipeline
|
||||||
from .cli.info import info as cli_info
|
from .cli.info import info
|
||||||
from .glossary import explain
|
from .glossary import explain
|
||||||
from .about import __version__
|
from .about import __version__
|
||||||
from .errors import Errors, Warnings
|
from .errors import Errors, Warnings
|
||||||
|
@ -34,7 +34,3 @@ def load(name, **overrides):
|
||||||
def blank(name, **kwargs):
|
def blank(name, **kwargs):
|
||||||
LangClass = util.get_lang_class(name)
|
LangClass = util.get_lang_class(name)
|
||||||
return LangClass(**kwargs)
|
return LangClass(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
def info(model=None, markdown=False, silent=False):
|
|
||||||
return cli_info(model, markdown, silent)
|
|
||||||
|
|
|
@ -1,31 +1,4 @@
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import plac
|
from spacy.cli import setup_cli
|
||||||
import sys
|
|
||||||
from wasabi import msg
|
|
||||||
from spacy.cli import download, link, info, package, pretrain, convert
|
|
||||||
from spacy.cli import init_model, profile, evaluate, validate, debug_data
|
|
||||||
from spacy.cli import train_cli
|
|
||||||
|
|
||||||
commands = {
|
setup_cli()
|
||||||
"download": download,
|
|
||||||
"link": link,
|
|
||||||
"info": info,
|
|
||||||
"train": train_cli,
|
|
||||||
"pretrain": pretrain,
|
|
||||||
"debug-data": debug_data,
|
|
||||||
"evaluate": evaluate,
|
|
||||||
"convert": convert,
|
|
||||||
"package": package,
|
|
||||||
"init-model": init_model,
|
|
||||||
"profile": profile,
|
|
||||||
"validate": validate,
|
|
||||||
}
|
|
||||||
if len(sys.argv) == 1:
|
|
||||||
msg.info("Available commands", ", ".join(commands), exits=1)
|
|
||||||
command = sys.argv.pop(1)
|
|
||||||
sys.argv[0] = f"spacy {command}"
|
|
||||||
if command in commands:
|
|
||||||
plac.call(commands[command], sys.argv[1:])
|
|
||||||
else:
|
|
||||||
available = f"Available: {', '.join(commands)}"
|
|
||||||
msg.fail(f"Unknown command: {command}", available, exits=1)
|
|
||||||
|
|
|
@ -5,3 +5,4 @@ __release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
||||||
|
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
|
from ._app import app, setup_cli # noqa: F401
|
||||||
|
|
||||||
|
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||||
|
# are registered automatically and won't have to be imported here.
|
||||||
from .download import download # noqa: F401
|
from .download import download # noqa: F401
|
||||||
from .info import info # noqa: F401
|
from .info import info # noqa: F401
|
||||||
from .package import package # noqa: F401
|
from .package import package # noqa: F401
|
||||||
|
@ -13,7 +17,10 @@ from .init_model import init_model # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
def link(*args, **kwargs):
|
def link(*args, **kwargs):
|
||||||
|
"""As of spaCy v3.0, model symlinks are deprecated. You can load models
|
||||||
|
using their full names or from a directory path."""
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
|
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
|
||||||
"using their full names or from a directory path."
|
"using their full names or from a directory path."
|
||||||
|
|
31
spacy/cli/_app.py
Normal file
31
spacy/cli/_app.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
from typing import Optional
|
||||||
|
import typer
|
||||||
|
from typer.main import get_command
|
||||||
|
|
||||||
|
|
||||||
|
COMMAND = "python -m spacy"
|
||||||
|
NAME = "spacy"
|
||||||
|
HELP = """spaCy Command-line Interface
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/cli
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
|
|
||||||
|
|
||||||
|
def Arg(*args, help: Optional[str] = None, **kwargs) -> typer.Argument:
|
||||||
|
"""Wrapper for Typer's annotation to keep it short and set defaults."""
|
||||||
|
# Filter out help for now until it's officially supported
|
||||||
|
return typer.Argument(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def Opt(*args, **kwargs) -> typer.Option:
|
||||||
|
"""Wrapper for Typer's annotation to keep it short and set defaults."""
|
||||||
|
return typer.Option(*args, show_default=True, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def setup_cli() -> None:
|
||||||
|
# Ensure that the help messages always display the correct prompt
|
||||||
|
command = get_command(app)
|
||||||
|
command(prog_name=COMMAND)
|
|
@ -1,9 +1,12 @@
|
||||||
|
from typing import Optional
|
||||||
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
import srsly
|
import srsly
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
from ..tokens import DocBin
|
from ..tokens import DocBin
|
||||||
from ..gold.converters import iob2docs, conll_ner2docs, json2docs
|
from ..gold.converters import iob2docs, conll_ner2docs, json2docs
|
||||||
|
|
||||||
|
@ -24,36 +27,80 @@ CONVERTERS = {
|
||||||
|
|
||||||
|
|
||||||
# File types
|
# File types
|
||||||
FILE_TYPES = ("json", "jsonl", "msg")
|
FILE_TYPES_STDOUT = ("json")
|
||||||
FILE_TYPES_STDOUT = ("json", "jsonl")
|
|
||||||
|
|
||||||
|
|
||||||
def convert(
|
class FileTypes(str, Enum):
|
||||||
|
json = "json"
|
||||||
|
spacy = "spacy"
|
||||||
|
|
||||||
|
|
||||||
|
@app.command("convert")
|
||||||
|
def convert_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
input_path: ("Input file or directory", "positional", None, Path),
|
input_path: str = Arg(..., help="Input file or directory", exists=True),
|
||||||
output_dir: ("Output directory.", "positional", None, Path),
|
output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
|
||||||
file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy",
|
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
|
||||||
n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
|
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
|
||||||
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
|
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
|
||||||
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
|
model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
|
||||||
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
|
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||||
merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
|
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||||
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
|
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||||
ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
|
ner_map: Optional[Path] = Opt(None, "--ner-map", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||||
lang: ("Language (if tokenizer required)", "option", "l", str) = None,
|
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Convert files into json or DocBin format for use with train command and other
|
Convert files into json or DocBin format for use with train command and other
|
||||||
experiment management functions.
|
experiment management functions. If no output_dir is specified, the data
|
||||||
|
is written to stdout, so you can pipe them forward to a JSON file:
|
||||||
|
$ spacy convert some_file.conllu > some_file.json
|
||||||
"""
|
"""
|
||||||
|
if isinstance(file_type, FileTypes):
|
||||||
|
# We get an instance of the FileTypes from the CLI so we need its string value
|
||||||
|
file_type = file_type.value
|
||||||
cli_args = locals()
|
cli_args = locals()
|
||||||
no_print = output_dir == "-"
|
silent = output_dir == "-"
|
||||||
output_dir = Path(output_dir) if output_dir != "-" else "-"
|
output_dir = Path(output_dir) if output_dir != "-" else "-"
|
||||||
msg = Printer(no_print=no_print)
|
msg = Printer(no_print=silent)
|
||||||
verify_cli_args(msg, **cli_args)
|
verify_cli_args(msg, **cli_args)
|
||||||
converter = _get_converter(msg, converter, input_path)
|
convert(
|
||||||
|
input_path,
|
||||||
|
output_dir,
|
||||||
|
file_type=file_type,
|
||||||
|
n_sents=n_sents,
|
||||||
|
seg_sents=seg_sents,
|
||||||
|
model=model,
|
||||||
|
morphology=morphology,
|
||||||
|
merge_subtokens=merge_subtokens,
|
||||||
|
converter=converter,
|
||||||
|
ner_map=ner_map,
|
||||||
|
lang=lang,
|
||||||
|
silent=silent,
|
||||||
|
msg=msg,
|
||||||
|
)
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
input_path: Path,
|
||||||
|
output_dir: Path,
|
||||||
|
*,
|
||||||
|
file_type: str = "json",
|
||||||
|
n_sents: int = 1,
|
||||||
|
seg_sents: bool = False,
|
||||||
|
model: Optional[str] = None,
|
||||||
|
morphology: bool = False,
|
||||||
|
merge_subtokens: bool = False,
|
||||||
|
converter: str = "auto",
|
||||||
|
ner_map: Optional[Path] = None,
|
||||||
|
lang: Optional[str] = None,
|
||||||
|
silent: bool = True,
|
||||||
|
msg: Optional[Path] = None,
|
||||||
|
) -> None:
|
||||||
|
if not msg:
|
||||||
|
msg = Printer(no_print=silent)
|
||||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||||
|
|
||||||
for input_loc in walk_directory(input_path):
|
for input_loc in walk_directory(input_path):
|
||||||
input_data = input_loc.open("r", encoding="utf-8").read()
|
input_data = input_loc.open("r", encoding="utf-8").read()
|
||||||
# Use converter function to convert data
|
# Use converter function to convert data
|
||||||
|
@ -66,25 +113,30 @@ def convert(
|
||||||
merge_subtokens=merge_subtokens,
|
merge_subtokens=merge_subtokens,
|
||||||
lang=lang,
|
lang=lang,
|
||||||
model=model,
|
model=model,
|
||||||
no_print=no_print,
|
no_print=silent,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
)
|
)
|
||||||
|
if output_dir != "-":
|
||||||
|
# Export data to a file
|
||||||
suffix = f".{file_type}"
|
suffix = f".{file_type}"
|
||||||
subpath = input_loc.relative_to(input_path)
|
subpath = input_loc.relative_to(input_path)
|
||||||
output_file = (output_dir / subpath).with_suffix(suffix)
|
output_file = Path(output_dir) / subpath.with_suffix(suffix)
|
||||||
if not output_file.parent.exists():
|
if not output_file.parent.exists():
|
||||||
output_file.parent.mkdir(parents=True)
|
output_file.parent.mkdir(parents=True)
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
data = docs2json(docs)
|
|
||||||
srsly.write_json(output_file, docs2json(docs))
|
srsly.write_json(output_file, docs2json(docs))
|
||||||
else:
|
else:
|
||||||
data = DocBin(docs=docs).to_bytes()
|
data = DocBin(docs=docs).to_bytes()
|
||||||
with output_file.open("wb") as file_:
|
with output_file.open("wb") as file_:
|
||||||
file_.write(data)
|
file_.write(data)
|
||||||
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
|
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
|
||||||
|
else:
|
||||||
|
# Print to stdout
|
||||||
|
if file_type == "json":
|
||||||
|
srsly.write_json("-", docs)
|
||||||
|
|
||||||
|
|
||||||
def autodetect_ner_format(input_data):
|
def autodetect_ner_format(input_data: str) -> str:
|
||||||
# guess format from the first 20 lines
|
# guess format from the first 20 lines
|
||||||
lines = input_data.split("\n")[:20]
|
lines = input_data.split("\n")[:20]
|
||||||
format_guesses = {"ner": 0, "iob": 0}
|
format_guesses = {"ner": 0, "iob": 0}
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
|
from typing import Optional, List, Sequence, Dict, Any, Tuple
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import sys
|
import sys
|
||||||
import srsly
|
import srsly
|
||||||
from wasabi import Printer, MESSAGES
|
from wasabi import Printer, MESSAGES
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
from ..gold import Corpus
|
from ..gold import Corpus
|
||||||
from ..syntax import nonproj
|
from ..syntax import nonproj
|
||||||
|
from ..language import Language
|
||||||
from ..util import load_model, get_lang_class
|
from ..util import load_model, get_lang_class
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,17 +21,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
|
||||||
BLANK_MODEL_THRESHOLD = 2000
|
BLANK_MODEL_THRESHOLD = 2000
|
||||||
|
|
||||||
|
|
||||||
def debug_data(
|
@app.command("debug-data")
|
||||||
|
def debug_data_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
lang: ("Model language", "positional", None, str),
|
lang: str = Arg(..., help="Model language"),
|
||||||
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
|
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
|
||||||
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
|
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
|
||||||
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
|
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
|
||||||
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
|
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
|
||||||
pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
|
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
|
||||||
ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
|
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
||||||
verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
|
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
|
||||||
no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
|
no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -36,8 +40,36 @@ def debug_data(
|
||||||
stats, and find problems like invalid entity annotations, cyclic
|
stats, and find problems like invalid entity annotations, cyclic
|
||||||
dependencies, low data labels and more.
|
dependencies, low data labels and more.
|
||||||
"""
|
"""
|
||||||
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
|
debug_data(
|
||||||
|
lang,
|
||||||
|
train_path,
|
||||||
|
dev_path,
|
||||||
|
tag_map_path=tag_map_path,
|
||||||
|
base_model=base_model,
|
||||||
|
pipeline=[p.strip() for p in pipeline.split(",")],
|
||||||
|
ignore_warnings=ignore_warnings,
|
||||||
|
verbose=verbose,
|
||||||
|
no_format=no_format,
|
||||||
|
silent=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def debug_data(
|
||||||
|
lang: str,
|
||||||
|
train_path: Path,
|
||||||
|
dev_path: Path,
|
||||||
|
*,
|
||||||
|
tag_map_path: Optional[Path] = None,
|
||||||
|
base_model: Optional[str] = None,
|
||||||
|
pipeline: List[str] = ["tagger", "parser", "ner"],
|
||||||
|
ignore_warnings: bool = False,
|
||||||
|
verbose: bool = False,
|
||||||
|
no_format: bool = True,
|
||||||
|
silent: bool = True,
|
||||||
|
):
|
||||||
|
msg = Printer(
|
||||||
|
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
|
||||||
|
)
|
||||||
# Make sure all files and paths exists if they are needed
|
# Make sure all files and paths exists if they are needed
|
||||||
if not train_path.exists():
|
if not train_path.exists():
|
||||||
msg.fail("Training data not found", train_path, exits=1)
|
msg.fail("Training data not found", train_path, exits=1)
|
||||||
|
@ -49,7 +81,6 @@ def debug_data(
|
||||||
tag_map = srsly.read_json(tag_map_path)
|
tag_map = srsly.read_json(tag_map_path)
|
||||||
|
|
||||||
# Initialize the model and pipeline
|
# Initialize the model and pipeline
|
||||||
pipeline = [p.strip() for p in pipeline.split(",")]
|
|
||||||
if base_model:
|
if base_model:
|
||||||
nlp = load_model(base_model)
|
nlp = load_model(base_model)
|
||||||
else:
|
else:
|
||||||
|
@ -446,7 +477,7 @@ def debug_data(
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
def _load_file(file_path, msg):
|
def _load_file(file_path: Path, msg: Printer) -> None:
|
||||||
file_name = file_path.parts[-1]
|
file_name = file_path.parts[-1]
|
||||||
if file_path.suffix == ".json":
|
if file_path.suffix == ".json":
|
||||||
with msg.loading(f"Loading {file_name}..."):
|
with msg.loading(f"Loading {file_name}..."):
|
||||||
|
@ -465,7 +496,9 @@ def _load_file(file_path, msg):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _compile_gold(examples, pipeline, nlp):
|
def _compile_gold(
|
||||||
|
examples: Sequence[Example], pipeline: List[str], nlp: Language
|
||||||
|
) -> Dict[str, Any]:
|
||||||
data = {
|
data = {
|
||||||
"ner": Counter(),
|
"ner": Counter(),
|
||||||
"cats": Counter(),
|
"cats": Counter(),
|
||||||
|
@ -537,13 +570,13 @@ def _compile_gold(examples, pipeline, nlp):
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def _format_labels(labels, counts=False):
|
def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
|
||||||
if counts:
|
if counts:
|
||||||
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
||||||
return ", ".join([f"'{l}'" for l in labels])
|
return ", ".join([f"'{l}'" for l in labels])
|
||||||
|
|
||||||
|
|
||||||
def _get_examples_without_label(data, label):
|
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
||||||
count = 0
|
count = 0
|
||||||
for eg in data:
|
for eg in data:
|
||||||
labels = [
|
labels = [
|
||||||
|
@ -556,7 +589,7 @@ def _get_examples_without_label(data, label):
|
||||||
return count
|
return count
|
||||||
|
|
||||||
|
|
||||||
def _get_labels_from_model(nlp, pipe_name):
|
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
|
||||||
if pipe_name not in nlp.pipe_names:
|
if pipe_name not in nlp.pipe_names:
|
||||||
return set()
|
return set()
|
||||||
pipe = nlp.get_pipe(pipe_name)
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
|
|
|
@ -1,23 +1,36 @@
|
||||||
|
from typing import Optional, Sequence, Union
|
||||||
import requests
|
import requests
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import sys
|
import sys
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_base_version
|
from ..util import is_package, get_base_version, run_command
|
||||||
|
|
||||||
|
|
||||||
def download(
|
@app.command(
|
||||||
model: ("Model to download (shortcut or name)", "positional", None, str),
|
"download",
|
||||||
direct: ("Force direct download of name + version", "flag", "d", bool) = False,
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
*pip_args: ("Additional arguments to be passed to `pip install` on model install"),
|
)
|
||||||
|
def download_cli(
|
||||||
|
# fmt: off
|
||||||
|
ctx: typer.Context,
|
||||||
|
model: str = Arg(..., help="Model to download (shortcut or name)"),
|
||||||
|
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Download compatible model from default download path using pip. If --direct
|
Download compatible model from default download path using pip. If --direct
|
||||||
flag is set, the command expects the full model name with version.
|
flag is set, the command expects the full model name with version.
|
||||||
For direct downloads, the compatibility check will be skipped.
|
For direct downloads, the compatibility check will be skipped. All
|
||||||
|
additional arguments provided to this command will be passed to `pip install`
|
||||||
|
on model installation.
|
||||||
"""
|
"""
|
||||||
|
download(model, direct, *ctx.args)
|
||||||
|
|
||||||
|
|
||||||
|
def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||||
if not is_package("spacy") and "--no-deps" not in pip_args:
|
if not is_package("spacy") and "--no-deps" not in pip_args:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Skipping model package dependencies and setting `--no-deps`. "
|
"Skipping model package dependencies and setting `--no-deps`. "
|
||||||
|
@ -33,22 +46,20 @@ def download(
|
||||||
components = model.split("-")
|
components = model.split("-")
|
||||||
model_name = "".join(components[:-1])
|
model_name = "".join(components[:-1])
|
||||||
version = components[-1]
|
version = components[-1]
|
||||||
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
else:
|
else:
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||||
model_name = shortcuts.get(model, model)
|
model_name = shortcuts.get(model, model)
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
if dl != 0: # if download subprocess doesn't return 0, exit
|
msg.good(
|
||||||
sys.exit(dl)
|
"Download and installation successful",
|
||||||
msg.good(
|
f"You can now load the model via spacy.load('{model_name}')",
|
||||||
"Download and installation successful",
|
)
|
||||||
f"You can now load the model via spacy.load('{model_name}')",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_json(url, desc):
|
def get_json(url: str, desc: str) -> Union[dict, list]:
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
|
@ -62,7 +73,7 @@ def get_json(url, desc):
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility():
|
def get_compatibility() -> dict:
|
||||||
version = get_base_version(about.__version__)
|
version = get_base_version(about.__version__)
|
||||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||||
comp = comp_table["spacy"]
|
comp = comp_table["spacy"]
|
||||||
|
@ -71,7 +82,7 @@ def get_compatibility():
|
||||||
return comp[version]
|
return comp[version]
|
||||||
|
|
||||||
|
|
||||||
def get_version(model, comp):
|
def get_version(model: str, comp: dict) -> str:
|
||||||
model = get_base_version(model)
|
model = get_base_version(model)
|
||||||
if model not in comp:
|
if model not in comp:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
|
@ -81,10 +92,12 @@ def get_version(model, comp):
|
||||||
return comp[model][0]
|
return comp[model][0]
|
||||||
|
|
||||||
|
|
||||||
def download_model(filename, user_pip_args=None):
|
def download_model(
|
||||||
|
filename: str, user_pip_args: Optional[Sequence[str]] = None
|
||||||
|
) -> None:
|
||||||
download_url = about.__download_url__ + "/" + filename
|
download_url = about.__download_url__ + "/" + filename
|
||||||
pip_args = ["--no-cache-dir"]
|
pip_args = ["--no-cache-dir"]
|
||||||
if user_pip_args:
|
if user_pip_args:
|
||||||
pip_args.extend(user_pip_args)
|
pip_args.extend(user_pip_args)
|
||||||
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
|
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
|
||||||
return subprocess.call(cmd, env=os.environ.copy())
|
run_command(cmd)
|
||||||
|
|
|
@ -1,26 +1,56 @@
|
||||||
|
from typing import Optional, List
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
from wasabi import msg
|
from wasabi import Printer
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from ..gold import Corpus
|
from ..gold import Corpus
|
||||||
|
from ..tokens import Doc
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
|
from ..scorer import Scorer
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
|
|
||||||
|
|
||||||
def evaluate(
|
@app.command("evaluate")
|
||||||
|
def evaluate_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
model: ("Model name or path", "positional", None, str),
|
model: str = Arg(..., help="Model name or path"),
|
||||||
data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
|
data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
|
||||||
gpu_id: ("Use GPU", "option", "g", int) = -1,
|
gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
|
||||||
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
|
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||||
displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
|
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||||
displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
|
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
||||||
return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
|
return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
|
||||||
# fmt: on
|
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||||
output directory as the displacy_path argument.
|
output directory as the displacy_path argument.
|
||||||
"""
|
"""
|
||||||
|
evaluate(
|
||||||
|
model,
|
||||||
|
data_path,
|
||||||
|
gpu_id=gpu_id,
|
||||||
|
gold_preproc=gold_preproc,
|
||||||
|
displacy_path=displacy_path,
|
||||||
|
displacy_limit=displacy_limit,
|
||||||
|
silent=False,
|
||||||
|
return_scores=return_scores,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(
|
||||||
|
model: str,
|
||||||
|
data_path: Path,
|
||||||
|
gpu_id: int = -1,
|
||||||
|
gold_preproc: bool = False,
|
||||||
|
displacy_path: Optional[Path] = None,
|
||||||
|
displacy_limit: int = 25,
|
||||||
|
silent: bool = True,
|
||||||
|
return_scores: bool = False,
|
||||||
|
) -> Scorer:
|
||||||
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
util.fix_random_seed()
|
util.fix_random_seed()
|
||||||
if gpu_id >= 0:
|
if gpu_id >= 0:
|
||||||
util.use_gpu(gpu_id)
|
util.use_gpu(gpu_id)
|
||||||
|
@ -79,7 +109,14 @@ def evaluate(
|
||||||
return scorer.scores
|
return scorer.scores
|
||||||
|
|
||||||
|
|
||||||
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
|
def render_parses(
|
||||||
|
docs: List[Doc],
|
||||||
|
output_path: Path,
|
||||||
|
model_name: str = "",
|
||||||
|
limit: int = 250,
|
||||||
|
deps: bool = True,
|
||||||
|
ents: bool = True,
|
||||||
|
):
|
||||||
docs[0].user_data["title"] = model_name
|
docs[0].user_data["title"] = model_name
|
||||||
if ents:
|
if ents:
|
||||||
html = displacy.render(docs[:limit], style="ent", page=True)
|
html = displacy.render(docs[:limit], style="ent", page=True)
|
||||||
|
|
|
@ -1,77 +1,109 @@
|
||||||
|
from typing import Optional, Dict, Any, Union
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import Printer
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from .validate import get_model_pkgs
|
from ._app import app, Arg, Opt
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
def info(
|
@app.command("info")
|
||||||
model: ("Optional model name", "positional", None, str) = None,
|
def info_cli(
|
||||||
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
|
# fmt: off
|
||||||
silent: ("Don't print anything (just return)", "flag", "s") = False,
|
model: Optional[str] = Arg(None, help="Optional model name"),
|
||||||
|
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||||
|
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Print info about spaCy installation. If a model is specified as an argument,
|
Print info about spaCy installation. If a model is specified as an argument,
|
||||||
print model information. Flag --markdown prints details in Markdown for easy
|
print model information. Flag --markdown prints details in Markdown for easy
|
||||||
copy-pasting to GitHub issues.
|
copy-pasting to GitHub issues.
|
||||||
"""
|
"""
|
||||||
|
info(model, markdown=markdown, silent=silent)
|
||||||
|
|
||||||
|
|
||||||
|
def info(
|
||||||
|
model: Optional[str] = None, *, markdown: bool = False, silent: bool = True
|
||||||
|
) -> Union[str, dict]:
|
||||||
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
if model:
|
if model:
|
||||||
if util.is_package(model):
|
title = f"Info about model '{model}'"
|
||||||
model_path = util.get_package_path(model)
|
data = info_model(model, silent=silent)
|
||||||
else:
|
else:
|
||||||
model_path = model
|
title = "Info about spaCy"
|
||||||
meta_path = model_path / "meta.json"
|
data = info_spacy()
|
||||||
if not meta_path.is_file():
|
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
|
||||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
if "Models" in data and isinstance(data["Models"], dict):
|
||||||
meta = srsly.read_json(meta_path)
|
data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
|
||||||
if model_path.resolve() != model_path:
|
markdown_data = get_markdown(data, title=title)
|
||||||
meta["link"] = str(model_path)
|
if markdown:
|
||||||
meta["source"] = str(model_path.resolve())
|
|
||||||
else:
|
|
||||||
meta["source"] = str(model_path)
|
|
||||||
if not silent:
|
if not silent:
|
||||||
title = f"Info about model '{model}'"
|
print(markdown_data)
|
||||||
model_meta = {
|
return markdown_data
|
||||||
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
if not silent:
|
||||||
}
|
table_data = dict(data)
|
||||||
if markdown:
|
msg.table(table_data, title=title)
|
||||||
print_markdown(model_meta, title=title)
|
return raw_data
|
||||||
else:
|
|
||||||
msg.table(model_meta, title=title)
|
|
||||||
return meta
|
def info_spacy() -> Dict[str, any]:
|
||||||
all_models, _ = get_model_pkgs()
|
"""Generate info about the current spaCy installation.
|
||||||
data = {
|
|
||||||
|
RETURNS (dict): The spaCy info.
|
||||||
|
"""
|
||||||
|
all_models = {}
|
||||||
|
for pkg_name in util.get_installed_models():
|
||||||
|
package = pkg_name.replace("-", "_")
|
||||||
|
all_models[package] = util.get_package_version(pkg_name)
|
||||||
|
return {
|
||||||
"spaCy version": about.__version__,
|
"spaCy version": about.__version__,
|
||||||
"Location": str(Path(__file__).parent.parent),
|
"Location": str(Path(__file__).parent.parent),
|
||||||
"Platform": platform.platform(),
|
"Platform": platform.platform(),
|
||||||
"Python version": platform.python_version(),
|
"Python version": platform.python_version(),
|
||||||
"Models": ", ".join(
|
"Models": all_models,
|
||||||
f"{m['name']} ({m['version']})" for m in all_models.values()
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
if not silent:
|
|
||||||
title = "Info about spaCy"
|
|
||||||
if markdown:
|
|
||||||
print_markdown(data, title=title)
|
|
||||||
else:
|
|
||||||
msg.table(data, title=title)
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def print_markdown(data, title=None):
|
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
||||||
"""Print data in GitHub-flavoured Markdown format for issues etc.
|
"""Generate info about a specific model.
|
||||||
|
|
||||||
|
model (str): Model name or path.
|
||||||
|
silent (bool): Don't print anything, just return.
|
||||||
|
RETURNS (dict): The model meta.
|
||||||
|
"""
|
||||||
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
|
if util.is_package(model):
|
||||||
|
model_path = util.get_package_path(model)
|
||||||
|
else:
|
||||||
|
model_path = model
|
||||||
|
meta_path = model_path / "meta.json"
|
||||||
|
if not meta_path.is_file():
|
||||||
|
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||||
|
meta = srsly.read_json(meta_path)
|
||||||
|
if model_path.resolve() != model_path:
|
||||||
|
meta["link"] = str(model_path)
|
||||||
|
meta["source"] = str(model_path.resolve())
|
||||||
|
else:
|
||||||
|
meta["source"] = str(model_path)
|
||||||
|
return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
|
||||||
|
|
||||||
|
|
||||||
|
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
|
||||||
|
"""Get data in GitHub-flavoured Markdown format for issues etc.
|
||||||
|
|
||||||
data (dict or list of tuples): Label/value pairs.
|
data (dict or list of tuples): Label/value pairs.
|
||||||
title (str / None): Title, will be rendered as headline 2.
|
title (str / None): Title, will be rendered as headline 2.
|
||||||
|
RETURNS (str): The Markdown string.
|
||||||
"""
|
"""
|
||||||
markdown = []
|
markdown = []
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
if isinstance(value, str) and Path(value).exists():
|
if isinstance(value, str) and Path(value).exists():
|
||||||
continue
|
continue
|
||||||
markdown.append(f"* **{key}:** {value}")
|
markdown.append(f"* **{key}:** {value}")
|
||||||
|
result = "\n{}\n".format("\n".join(markdown))
|
||||||
if title:
|
if title:
|
||||||
print(f"\n## {title}")
|
result = f"\n## {title}\n{result}"
|
||||||
print("\n{}\n".format("\n".join(markdown)))
|
return result
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from typing import Optional, List, Dict, Any, Union, IO
|
||||||
import math
|
import math
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -9,10 +10,12 @@ import gzip
|
||||||
import zipfile
|
import zipfile
|
||||||
import srsly
|
import srsly
|
||||||
import warnings
|
import warnings
|
||||||
from wasabi import msg
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
|
from ..language import Language
|
||||||
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
||||||
from ..lookups import Lookups
|
from ..lookups import Lookups
|
||||||
|
|
||||||
|
@ -25,20 +28,21 @@ except ImportError:
|
||||||
DEFAULT_OOV_PROB = -20
|
DEFAULT_OOV_PROB = -20
|
||||||
|
|
||||||
|
|
||||||
def init_model(
|
@app.command("init-model")
|
||||||
|
def init_model_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
lang: ("Model language", "positional", None, str),
|
lang: str = Arg(..., help="Model language"),
|
||||||
output_dir: ("Model output directory", "positional", None, Path),
|
output_dir: Path = Arg(..., help="Model output directory"),
|
||||||
freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
|
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||||
clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
|
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
||||||
jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
|
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
||||||
vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
|
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
|
||||||
prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
|
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
||||||
truncate_vectors: ("Optional number of vectors to truncate to when reading in vectors file", "option", "t", int) = 0,
|
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||||
vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
|
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
model_name: ("Optional name for the model meta", "option", "mn", str) = None,
|
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
|
||||||
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
|
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
|
||||||
base_model: ("Base model (for languages with custom tokenizers)", "option", "b", str) = None
|
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -46,6 +50,38 @@ def init_model(
|
||||||
and word vectors. If vectors are provided in Word2Vec format, they can
|
and word vectors. If vectors are provided in Word2Vec format, they can
|
||||||
be either a .txt or zipped as a .zip or .tar.gz.
|
be either a .txt or zipped as a .zip or .tar.gz.
|
||||||
"""
|
"""
|
||||||
|
init_model(
|
||||||
|
lang,
|
||||||
|
output_dir,
|
||||||
|
freqs_loc=freqs_loc,
|
||||||
|
clusters_loc=clusters_loc,
|
||||||
|
jsonl_loc=jsonl_loc,
|
||||||
|
prune_vectors=prune_vectors,
|
||||||
|
truncate_vectors=truncate_vectors,
|
||||||
|
vectors_name=vectors_name,
|
||||||
|
model_name=model_name,
|
||||||
|
omit_extra_lookups=omit_extra_lookups,
|
||||||
|
base_model=base_model,
|
||||||
|
silent=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def init_model(
|
||||||
|
lang: str,
|
||||||
|
output_dir: Path,
|
||||||
|
freqs_loc: Optional[Path] = None,
|
||||||
|
clusters_loc: Optional[Path] = None,
|
||||||
|
jsonl_loc: Optional[Path] = None,
|
||||||
|
vectors_loc: Optional[Path] = None,
|
||||||
|
prune_vectors: int = -1,
|
||||||
|
truncate_vectors: int = 0,
|
||||||
|
vectors_name: Optional[str] = None,
|
||||||
|
model_name: Optional[str] = None,
|
||||||
|
omit_extra_lookups: bool = False,
|
||||||
|
base_model: Optional[str] = None,
|
||||||
|
silent: bool = True,
|
||||||
|
) -> Language:
|
||||||
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
if jsonl_loc is not None:
|
if jsonl_loc is not None:
|
||||||
if freqs_loc is not None or clusters_loc is not None:
|
if freqs_loc is not None or clusters_loc is not None:
|
||||||
settings = ["-j"]
|
settings = ["-j"]
|
||||||
|
@ -68,7 +104,7 @@ def init_model(
|
||||||
freqs_loc = ensure_path(freqs_loc)
|
freqs_loc = ensure_path(freqs_loc)
|
||||||
if freqs_loc is not None and not freqs_loc.exists():
|
if freqs_loc is not None and not freqs_loc.exists():
|
||||||
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
|
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
|
||||||
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
|
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
|
||||||
|
|
||||||
with msg.loading("Creating model..."):
|
with msg.loading("Creating model..."):
|
||||||
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
||||||
|
@ -83,7 +119,9 @@ def init_model(
|
||||||
|
|
||||||
msg.good("Successfully created model")
|
msg.good("Successfully created model")
|
||||||
if vectors_loc is not None:
|
if vectors_loc is not None:
|
||||||
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
|
add_vectors(
|
||||||
|
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
|
||||||
|
)
|
||||||
vec_added = len(nlp.vocab.vectors)
|
vec_added = len(nlp.vocab.vectors)
|
||||||
lex_added = len(nlp.vocab)
|
lex_added = len(nlp.vocab)
|
||||||
msg.good(
|
msg.good(
|
||||||
|
@ -95,7 +133,7 @@ def init_model(
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def open_file(loc):
|
def open_file(loc: Union[str, Path]) -> IO:
|
||||||
"""Handle .gz, .tar.gz or unzipped files"""
|
"""Handle .gz, .tar.gz or unzipped files"""
|
||||||
loc = ensure_path(loc)
|
loc = ensure_path(loc)
|
||||||
if tarfile.is_tarfile(str(loc)):
|
if tarfile.is_tarfile(str(loc)):
|
||||||
|
@ -111,7 +149,9 @@ def open_file(loc):
|
||||||
return loc.open("r", encoding="utf8")
|
return loc.open("r", encoding="utf8")
|
||||||
|
|
||||||
|
|
||||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
def read_attrs_from_deprecated(
|
||||||
|
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
if freqs_loc is not None:
|
if freqs_loc is not None:
|
||||||
with msg.loading("Counting frequencies..."):
|
with msg.loading("Counting frequencies..."):
|
||||||
probs, _ = read_freqs(freqs_loc)
|
probs, _ = read_freqs(freqs_loc)
|
||||||
|
@ -139,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||||
return lex_attrs
|
return lex_attrs
|
||||||
|
|
||||||
|
|
||||||
def create_model(lang, lex_attrs, name=None, base_model=None):
|
def create_model(
|
||||||
|
lang: str,
|
||||||
|
lex_attrs: List[Dict[str, Any]],
|
||||||
|
name: Optional[str] = None,
|
||||||
|
base_model: Optional[Union[str, Path]] = None,
|
||||||
|
) -> Language:
|
||||||
if base_model:
|
if base_model:
|
||||||
nlp = load_model(base_model)
|
nlp = load_model(base_model)
|
||||||
# keep the tokenizer but remove any existing pipeline components due to
|
# keep the tokenizer but remove any existing pipeline components due to
|
||||||
|
@ -166,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
def add_vectors(
|
||||||
|
msg: Printer,
|
||||||
|
nlp: Language,
|
||||||
|
vectors_loc: Optional[Path],
|
||||||
|
truncate_vectors: int,
|
||||||
|
prune_vectors: int,
|
||||||
|
name: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
vectors_loc = ensure_path(vectors_loc)
|
||||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||||
|
@ -176,7 +228,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
||||||
else:
|
else:
|
||||||
if vectors_loc:
|
if vectors_loc:
|
||||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
||||||
vectors_data, vector_keys = read_vectors(vectors_loc)
|
vectors_data, vector_keys = read_vectors(msg, vectors_loc)
|
||||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
msg.good(f"Loaded vectors from {vectors_loc}")
|
||||||
else:
|
else:
|
||||||
vectors_data, vector_keys = (None, None)
|
vectors_data, vector_keys = (None, None)
|
||||||
|
@ -195,7 +247,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
||||||
nlp.vocab.prune_vectors(prune_vectors)
|
nlp.vocab.prune_vectors(prune_vectors)
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(vectors_loc, truncate_vectors=0):
|
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
|
||||||
f = open_file(vectors_loc)
|
f = open_file(vectors_loc)
|
||||||
shape = tuple(int(size) for size in next(f).split())
|
shape = tuple(int(size) for size in next(f).split())
|
||||||
if truncate_vectors >= 1:
|
if truncate_vectors >= 1:
|
||||||
|
@ -215,7 +267,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
|
||||||
return vectors_data, vectors_keys
|
return vectors_data, vectors_keys
|
||||||
|
|
||||||
|
|
||||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
def read_freqs(
|
||||||
|
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
|
||||||
|
):
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
total = 0
|
total = 0
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
|
@ -244,7 +298,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
return probs, oov_prob
|
return probs, oov_prob
|
||||||
|
|
||||||
|
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc: Path) -> dict:
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
warnings.warn(Warnings.W004)
|
warnings.warn(Warnings.W004)
|
||||||
|
|
|
@ -1,19 +1,24 @@
|
||||||
|
from typing import Optional, Union, Any, Dict
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg, get_raw_input
|
from wasabi import Printer, get_raw_input
|
||||||
import srsly
|
import srsly
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
|
from ..schemas import validate, ModelMetaSchema
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
def package(
|
@app.command("package")
|
||||||
|
def package_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
input_dir: ("Directory with model data", "positional", None, str),
|
input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
|
||||||
output_dir: ("Output parent directory", "positional", None, str),
|
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||||
meta_path: ("Path to meta.json", "option", "m", str) = None,
|
meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||||
create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
|
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||||
force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
|
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -23,6 +28,25 @@ def package(
|
||||||
set and a meta.json already exists in the output directory, the existing
|
set and a meta.json already exists in the output directory, the existing
|
||||||
values will be used as the defaults in the command-line prompt.
|
values will be used as the defaults in the command-line prompt.
|
||||||
"""
|
"""
|
||||||
|
package(
|
||||||
|
input_dir,
|
||||||
|
output_dir,
|
||||||
|
meta_path=meta_path,
|
||||||
|
create_meta=create_meta,
|
||||||
|
force=force,
|
||||||
|
silent=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def package(
|
||||||
|
input_dir: Path,
|
||||||
|
output_dir: Path,
|
||||||
|
meta_path: Optional[Path] = None,
|
||||||
|
create_meta: bool = False,
|
||||||
|
force: bool = False,
|
||||||
|
silent: bool = True,
|
||||||
|
) -> None:
|
||||||
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
input_path = util.ensure_path(input_dir)
|
input_path = util.ensure_path(input_dir)
|
||||||
output_path = util.ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
|
@ -33,23 +57,20 @@ def package(
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||||
|
|
||||||
meta_path = meta_path or input_path / "meta.json"
|
meta_path = meta_path or input_dir / "meta.json"
|
||||||
if meta_path.is_file():
|
if not meta_path.exists() or not meta_path.is_file():
|
||||||
meta = srsly.read_json(meta_path)
|
msg.fail("Can't load model meta.json", meta_path, exits=1)
|
||||||
if not create_meta: # only print if user doesn't want to overwrite
|
meta = srsly.read_json(meta_path)
|
||||||
msg.good("Loaded meta.json from file", meta_path)
|
if not create_meta: # only print if user doesn't want to overwrite
|
||||||
else:
|
msg.good("Loaded meta.json from file", meta_path)
|
||||||
meta = generate_meta(input_dir, meta, msg)
|
else:
|
||||||
for key in ("lang", "name", "version"):
|
meta = generate_meta(input_dir, meta, msg)
|
||||||
if key not in meta or meta[key] == "":
|
errors = validate(ModelMetaSchema, meta)
|
||||||
msg.fail(
|
if errors:
|
||||||
f"No '{key}' setting found in meta.json",
|
msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
|
||||||
"This setting is required to build your package.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
model_name = meta["lang"] + "_" + meta["name"]
|
model_name = meta["lang"] + "_" + meta["name"]
|
||||||
model_name_v = model_name + "-" + meta["version"]
|
model_name_v = model_name + "-" + meta["version"]
|
||||||
main_path = output_path / model_name_v
|
main_path = output_dir / model_name_v
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
if package_path.exists():
|
if package_path.exists():
|
||||||
|
@ -63,21 +84,26 @@ def package(
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
shutil.copytree(str(input_path), str(package_path / model_name_v))
|
shutil.copytree(str(input_dir), str(package_path / model_name_v))
|
||||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||||
msg.text("To build the package, run `python setup.py sdist` in this directory.")
|
with util.working_dir(main_path):
|
||||||
|
util.run_command([sys.executable, "setup.py", "sdist"])
|
||||||
|
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
|
||||||
|
msg.good(f"Successfully created zipped Python package", zip_file)
|
||||||
|
|
||||||
|
|
||||||
def create_file(file_path, contents):
|
def create_file(file_path: Path, contents: str) -> None:
|
||||||
file_path.touch()
|
file_path.touch()
|
||||||
file_path.open("w", encoding="utf-8").write(contents)
|
file_path.open("w", encoding="utf-8").write(contents)
|
||||||
|
|
||||||
|
|
||||||
def generate_meta(model_path, existing_meta, msg):
|
def generate_meta(
|
||||||
|
model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
|
||||||
|
) -> Dict[str, Any]:
|
||||||
meta = existing_meta or {}
|
meta = existing_meta or {}
|
||||||
settings = [
|
settings = [
|
||||||
("lang", "Model language", meta.get("lang", "en")),
|
("lang", "Model language", meta.get("lang", "en")),
|
||||||
|
|
|
@ -1,14 +1,15 @@
|
||||||
|
from typing import Optional
|
||||||
import random
|
import random
|
||||||
import numpy
|
import numpy
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
|
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..ml.models.multi_task import build_masked_language_model
|
from ..ml.models.multi_task import build_masked_language_model
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
@ -17,25 +18,17 @@ from .. import util
|
||||||
from ..gold import Example
|
from ..gold import Example
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@app.command("pretrain")
|
||||||
|
def pretrain_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
|
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
|
||||||
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
|
vectors_model: str = Arg(..., help="Name or path to spaCy model with vectors to learn from"),
|
||||||
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
|
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
|
||||||
config_path=("Path to config file", "positional", None, Path),
|
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
|
||||||
resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
|
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||||
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
|
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
)
|
|
||||||
def pretrain(
|
|
||||||
texts_loc,
|
|
||||||
vectors_model,
|
|
||||||
config_path,
|
|
||||||
output_dir,
|
|
||||||
use_gpu=-1,
|
|
||||||
resume_path=None,
|
|
||||||
epoch_resume=None,
|
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||||
|
@ -52,6 +45,26 @@ def pretrain(
|
||||||
all settings are the same between pretraining and training. Ideally,
|
all settings are the same between pretraining and training. Ideally,
|
||||||
this is done by using the same config file for both commands.
|
this is done by using the same config file for both commands.
|
||||||
"""
|
"""
|
||||||
|
pretrain(
|
||||||
|
texts_loc,
|
||||||
|
vectors_model,
|
||||||
|
output_dir,
|
||||||
|
config_path,
|
||||||
|
use_gpu=use_gpu,
|
||||||
|
resume_path=resume_path,
|
||||||
|
epoch_resume=epoch_resume,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def pretrain(
|
||||||
|
texts_loc: Path,
|
||||||
|
vectors_model: str,
|
||||||
|
output_dir: Path,
|
||||||
|
config_path: Path,
|
||||||
|
use_gpu: int = -1,
|
||||||
|
resume_path: Optional[Path] = None,
|
||||||
|
epoch_resume: Optional[int] = None,
|
||||||
|
):
|
||||||
if not config_path or not config_path.exists():
|
if not config_path or not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exits=1)
|
msg.fail("Config file not found", config_path, exits=1)
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from typing import Optional, Sequence, Union, Iterator
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -5,17 +6,19 @@ import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
import itertools
|
||||||
import ml_datasets
|
from wasabi import msg, Printer
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
|
from ..language import Language
|
||||||
from ..util import load_model
|
from ..util import load_model
|
||||||
|
|
||||||
|
|
||||||
def profile(
|
@app.command("profile")
|
||||||
|
def profile_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
model: ("Model to load", "positional", None, str),
|
model: str = Arg(..., help="Model to load"),
|
||||||
inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
|
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
|
||||||
n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
|
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -24,6 +27,18 @@ def profile(
|
||||||
It can either be provided as a JSONL file, or be read from sys.stdin.
|
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||||
"""
|
"""
|
||||||
|
profile(model, inputs=inputs, n_texts=n_texts)
|
||||||
|
|
||||||
|
|
||||||
|
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
|
||||||
|
try:
|
||||||
|
import ml_datasets
|
||||||
|
except ImportError:
|
||||||
|
msg.fail(
|
||||||
|
"This command requires the ml_datasets library to be installed:"
|
||||||
|
"pip install ml_datasets",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
if inputs is not None:
|
if inputs is not None:
|
||||||
inputs = _read_inputs(inputs, msg)
|
inputs = _read_inputs(inputs, msg)
|
||||||
if inputs is None:
|
if inputs is None:
|
||||||
|
@ -43,12 +58,12 @@ def profile(
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
||||||
def parse_texts(nlp, texts):
|
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
|
||||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _read_inputs(loc, msg):
|
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
|
||||||
if loc == "-":
|
if loc == "-":
|
||||||
msg.info("Reading input from sys.stdin")
|
msg.info("Reading input from sys.stdin")
|
||||||
file_ = sys.stdin
|
file_ = sys.stdin
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
from typing import Optional, Dict, List, Union, Sequence
|
from typing import Optional, Dict
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import srsly
|
import srsly
|
||||||
from pydantic import BaseModel, FilePath
|
|
||||||
import tqdm
|
import tqdm
|
||||||
|
from pydantic import BaseModel, FilePath
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import thinc
|
import thinc
|
||||||
|
@ -12,6 +12,7 @@ import thinc.schedules
|
||||||
from thinc.api import Model, use_pytorch_for_gpu_memory
|
from thinc.api import Model, use_pytorch_for_gpu_memory
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt
|
||||||
from ..gold import Corpus
|
from ..gold import Corpus
|
||||||
from ..lookups import Lookups
|
from ..lookups import Lookups
|
||||||
from .. import util
|
from .. import util
|
||||||
|
@ -20,6 +21,9 @@ from ..errors import Errors
|
||||||
# Don't remove - required to load the built-in architectures
|
# Don't remove - required to load the built-in architectures
|
||||||
from ..ml import models # noqa: F401
|
from ..ml import models # noqa: F401
|
||||||
|
|
||||||
|
# from ..schemas import ConfigSchema # TODO: include?
|
||||||
|
|
||||||
|
|
||||||
registry = util.registry
|
registry = util.registry
|
||||||
|
|
||||||
CONFIG_STR = """
|
CONFIG_STR = """
|
||||||
|
@ -116,35 +120,21 @@ class ConfigSchema(BaseModel):
|
||||||
extra = "allow"
|
extra = "allow"
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@app.command("train")
|
||||||
# fmt: off
|
|
||||||
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
|
||||||
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
|
||||||
config_path=("Path to config file", "positional", None, Path),
|
|
||||||
output_path=("Output directory to store model in", "option", "o", Path),
|
|
||||||
code_path=("Path to Python file with additional code (registered functions) to be imported", "option", "c", Path),
|
|
||||||
init_tok2vec=(
|
|
||||||
"Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
|
|
||||||
Path),
|
|
||||||
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
|
|
||||||
verbose=("Display more information for debugging purposes", "flag", "VV", bool),
|
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
|
||||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
|
||||||
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
|
|
||||||
# fmt: on
|
|
||||||
)
|
|
||||||
def train_cli(
|
def train_cli(
|
||||||
train_path,
|
# fmt: off
|
||||||
dev_path,
|
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
|
||||||
config_path,
|
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
|
||||||
output_path=None,
|
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||||
code_path=None,
|
output_path: Optional[Path] = Opt(None, "--output-path", "-o", help="Output directory to store model in"),
|
||||||
init_tok2vec=None,
|
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
raw_text=None,
|
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
|
||||||
verbose=False,
|
raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
|
||||||
use_gpu=-1,
|
verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
|
||||||
tag_map_path=None,
|
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
|
||||||
omit_extra_lookups=False,
|
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
|
||||||
|
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||||
|
@ -183,14 +173,14 @@ def train_cli(
|
||||||
|
|
||||||
|
|
||||||
def train(
|
def train(
|
||||||
config_path,
|
config_path: Path,
|
||||||
data_paths,
|
data_paths: Dict[str, Path],
|
||||||
raw_text=None,
|
raw_text: Optional[Path] = None,
|
||||||
output_path=None,
|
output_path: Optional[Path] = None,
|
||||||
tag_map=None,
|
tag_map: Optional[Path] = None,
|
||||||
weights_data=None,
|
weights_data: Optional[bytes] = None,
|
||||||
omit_extra_lookups=False,
|
omit_extra_lookups: bool = False,
|
||||||
):
|
) -> None:
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
# Read the config first without creating objects, to get to the original nlp_config
|
# Read the config first without creating objects, to get to the original nlp_config
|
||||||
config = util.load_config(config_path, create_objects=False)
|
config = util.load_config(config_path, create_objects=False)
|
||||||
|
|
|
@ -1,18 +1,25 @@
|
||||||
|
from typing import Tuple
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import requests
|
import requests
|
||||||
from wasabi import msg
|
from wasabi import msg, Printer
|
||||||
|
|
||||||
|
from ._app import app
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import get_package_version, get_installed_models, get_base_version
|
from ..util import get_package_version, get_installed_models, get_base_version
|
||||||
from ..util import get_package_path, get_model_meta, is_compatible_version
|
from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||||
|
|
||||||
|
|
||||||
def validate():
|
@app.command("validate")
|
||||||
|
def validate_cli():
|
||||||
"""
|
"""
|
||||||
Validate that the currently installed version of spaCy is compatible
|
Validate that the currently installed version of spaCy is compatible
|
||||||
with the installed models. Should be run after `pip install -U spacy`.
|
with the installed models. Should be run after `pip install -U spacy`.
|
||||||
"""
|
"""
|
||||||
|
validate()
|
||||||
|
|
||||||
|
|
||||||
|
def validate() -> None:
|
||||||
model_pkgs, compat = get_model_pkgs()
|
model_pkgs, compat = get_model_pkgs()
|
||||||
spacy_version = get_base_version(about.__version__)
|
spacy_version = get_base_version(about.__version__)
|
||||||
current_compat = compat.get(spacy_version, {})
|
current_compat = compat.get(spacy_version, {})
|
||||||
|
@ -55,7 +62,8 @@ def validate():
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
def get_model_pkgs():
|
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
|
||||||
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
with msg.loading("Loading compatibility table..."):
|
with msg.loading("Loading compatibility table..."):
|
||||||
r = requests.get(about.__compatibility__)
|
r = requests.get(about.__compatibility__)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
|
@ -93,7 +101,7 @@ def get_model_pkgs():
|
||||||
return pkgs, compat
|
return pkgs, compat
|
||||||
|
|
||||||
|
|
||||||
def reformat_version(version):
|
def reformat_version(version: str) -> str:
|
||||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||||
if version.endswith("-alpha"):
|
if version.endswith("-alpha"):
|
||||||
return version.replace("-alpha", "a0")
|
return version.replace("-alpha", "a0")
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
from typing import Dict, List, Union, Optional
|
from typing import Dict, List, Union, Optional, Sequence, Any
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field, ValidationError, validator
|
from pydantic import BaseModel, Field, ValidationError, validator
|
||||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from thinc.api import Model
|
||||||
|
|
||||||
from .attrs import NAMES
|
from .attrs import NAMES
|
||||||
|
|
||||||
|
@ -163,24 +164,48 @@ class ModelMetaSchema(BaseModel):
|
||||||
email: Optional[StrictStr] = Field(None, title="Model author email")
|
email: Optional[StrictStr] = Field(None, title="Model author email")
|
||||||
url: Optional[StrictStr] = Field(None, title="Model author URL")
|
url: Optional[StrictStr] = Field(None, title="Model author URL")
|
||||||
sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
|
sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources")
|
||||||
vectors: Optional[Dict[str, int]] = Field(None, title="Included word vectors")
|
vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors")
|
||||||
accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
|
accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers")
|
||||||
speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
|
speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
# Training data object in "simple training style"
|
# JSON training format
|
||||||
|
|
||||||
|
|
||||||
class SimpleTrainingSchema(BaseModel):
|
class PipelineComponent(BaseModel):
|
||||||
# TODO: write
|
factory: str
|
||||||
|
model: Model
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
title = "Schema for training data dict in passed to nlp.update"
|
arbitrary_types_allowed = True
|
||||||
extra = "forbid"
|
|
||||||
|
|
||||||
|
|
||||||
# JSON training format
|
class ConfigSchema(BaseModel):
    # Schema for a config with an optional optimizer; keys that are not
    # declared here are accepted as well (see Config.extra below).
    # NOTE(review): "Optimizer" is a forward reference by name and is not
    # defined in the visible part of this module - confirm it is resolved.
    optimizer: Optional["Optimizer"]

    # Nested model: training settings and their default values. Only
    # batch_size has no default.
    class training(BaseModel):
        patience: int = 10
        eval_frequency: int = 100
        dropout: float = 0.2
        init_tok2vec: Optional[FilePath] = None
        max_epochs: int = 100
        orth_variant_level: float = 0.0
        gold_preproc: bool = False
        max_length: int = 0
        use_gpu: int = 0
        scores: List[str] = ["ents_p", "ents_r", "ents_f"]
        score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
        limit: int = 0
        # Either a single int or a sequence of ints.
        batch_size: Union[Sequence[int], int]

    # Nested model: language code, optional vectors and optional pipeline,
    # the latter keyed by a string and validated as PipelineComponent.
    class nlp(BaseModel):
        lang: str
        vectors: Optional[str]
        pipeline: Optional[Dict[str, PipelineComponent]]

    class Config:
        extra = "allow"
|
||||||
|
|
||||||
|
|
||||||
class TrainingSchema(BaseModel):
|
class TrainingSchema(BaseModel):
|
||||||
|
@ -189,3 +214,34 @@ class TrainingSchema(BaseModel):
|
||||||
class Config:
|
class Config:
|
||||||
title = "Schema for training data in spaCy's JSON format"
|
title = "Schema for training data in spaCy's JSON format"
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
|
||||||
|
|
||||||
|
# Project config Schema
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectConfigAsset(BaseModel):
    # One data asset entry of a project config. Both fields are required
    # strict strings.
    dest: StrictStr = Field(
        ...,
        title="Destination of downloaded asset",
    )
    url: StrictStr = Field(
        ...,
        title="URL of asset",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectConfigCommand(BaseModel):
    # One named command of a project config: the CLI commands it runs, in
    # order, plus its Data Version Control dependencies and outputs. Only the
    # name is required; the list fields default to empty lists.
    name: StrictStr = Field(..., title="Name of command")
    help: Optional[StrictStr] = Field(None, title="Command description")
    script: List[StrictStr] = Field(
        [], title="List of CLI commands to run, in order"
    )
    dvc_deps: List[StrictStr] = Field(
        [], title="Data Version Control dependencies"
    )
    dvc_outputs: List[StrictStr] = Field(
        [], title="Data Version Control outputs"
    )
    dvc_outputs_no_cache: List[StrictStr] = Field(
        [], title="Data Version Control outputs (no cache)"
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectConfigSchema(BaseModel):
    # Top-level schema of a project configuration file. Every field is
    # optional and defaults to an empty container.
    # fmt: off
    variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
    assets: List[ProjectConfigAsset] = Field([], title="Data assets")
    run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
    # Title typo fixed ("shortucts" -> "shortcuts"); it is user-facing text
    # that ends up in the generated schema.
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
    # fmt: on

    class Config:
        title = "Schema for project configuration file"
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
|
from typing import Iterator, List, Union
|
||||||
import os
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import random
|
import random
|
||||||
from typing import List
|
|
||||||
import thinc
|
import thinc
|
||||||
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
|
from thinc.api import NumpyOps, get_current_ops, Adam, require_gpu, Config
|
||||||
import functools
|
import functools
|
||||||
|
@ -17,6 +17,8 @@ import sys
|
||||||
import warnings
|
import warnings
|
||||||
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
||||||
from packaging.version import Version, InvalidVersion
|
from packaging.version import Version, InvalidVersion
|
||||||
|
import subprocess
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -429,6 +431,30 @@ def get_package_path(name):
|
||||||
return Path(pkg.__file__).parent
|
return Path(pkg.__file__).parent
|
||||||
|
|
||||||
|
|
||||||
|
def run_command(command: List[str]) -> None:
    """Execute a command as a subprocess and abort if it fails.

    The child process receives a copy of the current environment. If it
    finishes with a non-zero return code, the current process exits with that
    same code.

    command (list): The command, already split into its arguments.
    """
    exit_code = subprocess.call(command, env=dict(os.environ))
    if exit_code:
        sys.exit(exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[None]:
    """Change the current working directory and return to the previous one
    on exit.

    The previous directory is restored even if the body of the `with` block
    raises.

    path (str / Path): The directory to navigate to.
    YIELDS: Nothing, the context manager is used for its side effect only.
    """
    prev_cwd = Path.cwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        # Convert explicitly, mirroring the chdir call on entry.
        os.chdir(str(prev_cwd))
|
||||||
|
|
||||||
|
|
||||||
def is_in_jupyter():
|
def is_in_jupyter():
|
||||||
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
||||||
IPython kernel. Mainly used for the displaCy visualizer.
|
IPython kernel. Mainly used for the displaCy visualizer.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user