Modernize plac commands for Python 3 (#4836)

Ines Montani 2020-01-01 13:15:46 +01:00 committed by GitHub
parent 401946d480
commit 83e0a6f3e3
11 changed files with 129 additions and 268 deletions

View File

@@ -1,4 +1,3 @@
-import plac
 from pathlib import Path
 from wasabi import Printer
 import srsly
@@ -26,31 +25,19 @@ FILE_TYPES = ("json", "jsonl", "msg")
 FILE_TYPES_STDOUT = ("json", "jsonl")
-@plac.annotations(
-    # fmt: off
-    input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
-    file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES),
-    n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
-    seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
-    model=("Model for sentence segmentation (for -s)", "option", "b", str),
-    converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str),
-    lang=("Language (if tokenizer required)", "option", "l", str),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool),
-    ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,),
-    # fmt: on
-)
 def convert(
-    input_file,
-    output_dir="-",
-    file_type="json",
-    n_sents=1,
-    seg_sents=False,
-    model=None,
-    morphology=False,
-    converter="auto",
-    ner_map_path=None,
-    lang=None,
+    # fmt: off
+    input_file: ("Input file", "positional", None, str),
+    output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
+    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
+    n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
+    seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
+    model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
+    morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
+    converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
+    ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
+    lang: ("Language (if tokenizer required)", "option", "l", str) = None,
+    # fmt: on
 ):
     """
     Convert files into JSON format for use with train command and other
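The change is the same across all eleven CLI files: the @plac.annotations(...) decorator, which repeated every parameter name, is dropped, and the same (help, kind, abbreviation, type) tuples move into Python 3 function annotations with the defaults inline. A minimal, self-contained sketch of the pattern — the greet command below is hypothetical, not part of spaCy:

import plac


def greet(
    # plac reads each annotation tuple as (help, kind, abbreviation, type)
    name: ("Name to greet", "positional", None, str),
    loud: ("Shout the greeting", "flag", "l", bool) = False,
):
    """Toy command illustrating plac's Python 3 annotation style."""
    message = f"Hello, {name}!"
    print(message.upper() if loud else message)


if __name__ == "__main__":
    plac.call(greet)  # parses sys.argv according to the annotations

Running `python greet.py World -l` would print "HELLO, WORLD!", with plac deriving the -l flag from the annotation rather than from a decorator.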

View File

@@ -1,6 +1,5 @@
 from pathlib import Path
 from collections import Counter
-import plac
 import sys
 import srsly
 from wasabi import Printer, MESSAGES
@@ -19,29 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000
-@plac.annotations(
-    # fmt: off
-    lang=("model language", "positional", None, str),
-    train_path=("location of JSON-formatted training data", "positional", None, Path),
-    dev_path=("location of JSON-formatted development data", "positional", None, Path),
-    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
-    base_model=("name of model to update (optional)", "option", "b", str),
-    pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
-    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
-    verbose=("Print additional information and explanations", "flag", "V", bool),
-    no_format=("Don't pretty-print the results", "flag", "NF", bool),
-    # fmt: on
-)
 def debug_data(
-    lang,
-    train_path,
-    dev_path,
-    tag_map_path=None,
-    base_model=None,
-    pipeline="tagger,parser,ner",
-    ignore_warnings=False,
-    verbose=False,
-    no_format=False,
+    # fmt: off
+    lang: ("Model language", "positional", None, str),
+    train_path: ("Location of JSON-formatted training data", "positional", None, Path),
+    dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
+    tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
+    base_model: ("Name of model to update (optional)", "option", "b", str) = None,
+    pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
+    ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
+    verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
+    no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
+    # fmt: on
 ):
     """
     Analyze, debug and validate your training and development data, get useful

View File

@@ -1,4 +1,3 @@
-import plac
 import requests
 import os
 import subprocess
@@ -10,12 +9,11 @@ from ..util import get_package_path
 from .. import about
-@plac.annotations(
-    model=("Model to download (shortcut or name)", "positional", None, str),
-    direct=("Force direct download of name + version", "flag", "d", bool),
-    pip_args=("Additional arguments to be passed to `pip install` on model install"),
-)
-def download(model, direct=False, *pip_args):
+def download(
+    model: ("Model to download (shortcut or name)", "positional", None, str),
+    direct: ("Force direct download of name + version", "flag", "d", bool) = False,
+    *pip_args: ("Additional arguments to be passed to `pip install` on model install"),
+):
     """
     Download compatible model from default download path using pip. Model
     can be shortcut, model name or, if --direct flag is set, full model name
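The download() signature above shows a second plac idiom: a bare string annotation is treated as the help text alone, and an annotated *args parameter collects all remaining command-line tokens, which is how extra `pip install` arguments are passed through. A hedged sketch of that pattern, with an illustrative run command that is not part of spaCy:

import plac


def run(
    command: ("Command to run", "positional", None, str),
    # A plain string annotation supplies only the help text
    *extra_args: ("Additional arguments passed through verbatim"),
):
    # extra_args arrives as a tuple of the leftover positional tokens
    print(command, list(extra_args))


if __name__ == "__main__":
    plac.call(run)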

View File

@@ -1,4 +1,3 @@
-import plac
 from timeit import default_timer as timer
 from wasabi import msg
@@ -7,23 +6,16 @@ from .. import util
 from .. import displacy
-@plac.annotations(
-    model=("Model name or path", "positional", None, str),
-    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
-    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
-    gpu_id=("Use GPU", "option", "g", int),
-    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
-    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
-    return_scores=("Return dict containing model scores", "flag", "R", bool),
-)
 def evaluate(
-    model,
-    data_path,
-    gpu_id=-1,
-    gold_preproc=False,
-    displacy_path=None,
-    displacy_limit=25,
-    return_scores=False,
+    # fmt: off
+    model: ("Model name or path", "positional", None, str),
+    data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
+    gpu_id: ("Use GPU", "option", "g", int) = -1,
+    gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
+    displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
+    displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
+    return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
+    # fmt: on
 ):
     """
     Evaluate a model. To render a sample of parses in a HTML file, set an

View File

@@ -1,4 +1,3 @@
-import plac
 import platform
 from pathlib import Path
 from wasabi import msg
@@ -8,12 +7,11 @@ from .. import util
 from .. import about
-@plac.annotations(
-    model=("Optional shortcut link of model", "positional", None, str),
-    markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
-    silent=("Don't print anything (just return)", "flag", "s"),
-)
-def info(model=None, markdown=False, silent=False):
+def info(
+    model: ("Optional shortcut link of model", "positional", None, str) = None,
+    markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
+    silent: ("Don't print anything (just return)", "flag", "s") = False,
+):
     """
     Print info about spaCy installation. If a model shortcut link is
     speficied as an argument, print model information. Flag --markdown

View File

@@ -1,4 +1,3 @@
-import plac
 import math
 from tqdm import tqdm
 import numpy
@@ -24,32 +23,18 @@ except ImportError:
 DEFAULT_OOV_PROB = -20
-@plac.annotations(
-    lang=("Model language", "positional", None, str),
-    output_dir=("Model output directory", "positional", None, Path),
-    freqs_loc=("Location of words frequencies file", "option", "f", Path),
-    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
-    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
-    vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
-    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
-    vectors_name=(
-        "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
-        "option",
-        "vn",
-        str,
-    ),
-    model_name=("Optional name for the model meta", "option", "mn", str),
-)
 def init_model(
-    lang,
-    output_dir,
-    freqs_loc=None,
-    clusters_loc=None,
-    jsonl_loc=None,
-    vectors_loc=None,
-    prune_vectors=-1,
-    vectors_name=None,
-    model_name=None,
+    # fmt: off
+    lang: ("Model language", "positional", None, str),
+    output_dir: ("Model output directory", "positional", None, Path),
+    freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
+    clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
+    jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
+    vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
+    prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
+    vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
+    model_name: ("Optional name for the model meta", "option", "mn", str) = None,
+    # fmt: on
 ):
     """
     Create a new model from raw data, like word frequencies, Brown clusters

View File

@@ -1,4 +1,3 @@
-import plac
 from pathlib import Path
 from wasabi import msg
@@ -6,12 +5,12 @@ from ..compat import symlink_to
 from .. import util
-@plac.annotations(
-    origin=("package name or local path to model", "positional", None, str),
-    link_name=("name of shortuct link to create", "positional", None, str),
-    force=("force overwriting of existing link", "flag", "f", bool),
-)
-def link(origin, link_name, force=False, model_path=None):
+def link(
+    origin: ("package name or local path to model", "positional", None, str),
+    link_name: ("name of shortuct link to create", "positional", None, str),
+    force: ("force overwriting of existing link", "flag", "f", bool) = False,
+    model_path=None,
+):
     """
     Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data

View File

@@ -1,4 +1,3 @@
-import plac
 import shutil
 from pathlib import Path
 from wasabi import msg, get_raw_input
@@ -8,14 +7,15 @@ from .. import util
 from .. import about
-@plac.annotations(
-    input_dir=("Directory with model data", "positional", None, str),
-    output_dir=("Output parent directory", "positional", None, str),
-    meta_path=("Path to meta.json", "option", "m", str),
-    create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
-    force=("Force overwriting existing model in output directory", "flag", "f", bool),
-)
-def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
+def package(
+    # fmt: off
+    input_dir: ("Directory with model data", "positional", None, str),
+    output_dir: ("Output parent directory", "positional", None, str),
+    meta_path: ("Path to meta.json", "option", "m", str) = None,
+    create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
+    force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
+    # fmt: on
+):
     """
     Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified

View File

@@ -1,4 +1,3 @@
-import plac
 import random
 import numpy
 import time
@@ -21,85 +20,31 @@ from .. import util
 from .train import _load_pretrained_tok2vec
-@plac.annotations(
-    texts_loc=(
-        "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
-        "key 'tokens'",
-        "positional",
-        None,
-        str,
-    ),
-    vectors_model=("Name or path to spaCy model with vectors to learn from"),
-    output_dir=("Directory to write models to on each epoch", "positional", None, str),
-    width=("Width of CNN layers", "option", "cw", int),
-    depth=("Depth of CNN layers", "option", "cd", int),
-    cnn_window=("Window size for CNN layers", "option", "cW", int),
-    cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
-    use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
-    sa_depth=("Depth of self-attention layers", "option", "sa", int),
-    bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
-    embed_rows=("Number of embedding rows", "option", "er", int),
-    loss_func=(
-        "Loss function to use for the objective. Either 'L2' or 'cosine'",
-        "option",
-        "L",
-        str,
-    ),
-    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
-    dropout=("Dropout rate", "option", "d", float),
-    batch_size=("Number of words per training batch", "option", "bs", int),
-    max_length=(
-        "Max words per example. Longer examples are discarded",
-        "option",
-        "xw",
-        int,
-    ),
-    min_length=(
-        "Min words per example. Shorter examples are discarded",
-        "option",
-        "nw",
-        int,
-    ),
-    seed=("Seed for random number generators", "option", "s", int),
-    n_iter=("Number of iterations to pretrain", "option", "i", int),
-    n_save_every=("Save model every X batches.", "option", "se", int),
-    init_tok2vec=(
-        "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
-        "option",
-        "t2v",
-        Path,
-    ),
-    epoch_start=(
-        "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
-        "renamed. Prevents unintended overwriting of existing weight files.",
-        "option",
-        "es",
-        int,
-    ),
-)
 def pretrain(
-    texts_loc,
-    vectors_model,
-    output_dir,
-    width=96,
-    depth=4,
-    bilstm_depth=0,
-    cnn_pieces=3,
-    sa_depth=0,
-    use_chars=False,
-    cnn_window=1,
-    embed_rows=2000,
-    loss_func="cosine",
-    use_vectors=False,
-    dropout=0.2,
-    n_iter=1000,
-    batch_size=3000,
-    max_length=500,
-    min_length=5,
-    seed=0,
-    n_save_every=None,
-    init_tok2vec=None,
-    epoch_start=None,
+    # fmt: off
+    texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
+    vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str),
+    output_dir: ("Directory to write models to on each epoch", "positional", None, str),
+    width: ("Width of CNN layers", "option", "cw", int) = 96,
+    depth: ("Depth of CNN layers", "option", "cd", int) = 4,
+    bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0,
+    cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3,
+    sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0,
+    use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False,
+    cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1,
+    embed_rows: ("Number of embedding rows", "option", "er", int) = 2000,
+    loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine",
+    use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False,
+    dropout: ("Dropout rate", "option", "d", float) = 0.2,
+    n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000,
+    batch_size: ("Number of words per training batch", "option", "bs", int) = 3000,
+    max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500,
+    min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5,
+    seed: ("Seed for random number generators", "option", "s", int) = 0,
+    n_save_every: ("Save model every X batches.", "option", "se", int) = None,
+    init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
+    epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None,
+    # fmt: on
 ):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,

View File

@@ -1,4 +1,3 @@
-import plac
 import tqdm
 from pathlib import Path
 import srsly
@@ -12,12 +11,13 @@ from wasabi import msg
 from ..util import load_model
-@plac.annotations(
-    model=("Model to load", "positional", None, str),
-    inputs=("Location of input file. '-' for stdin.", "positional", None, str),
-    n_texts=("Maximum number of texts to use if available", "option", "n", int),
-)
-def profile(model, inputs=None, n_texts=10000):
+def profile(
+    # fmt: off
+    model: ("Model to load", "positional", None, str),
+    inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
+    n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
+    # fmt: on
+):
     """
     Profile a spaCy pipeline, to find out which functions take the most time.
     Input should be formatted as one JSON object per line with a key "text".

View File

@@ -1,4 +1,3 @@
-import plac
 import os
 import tqdm
 from pathlib import Path
@@ -17,67 +16,37 @@ from .. import util
 from .. import about
-@plac.annotations(
-    # fmt: off
-    lang=("Model language", "positional", None, str),
-    output_path=("Output directory to store model in", "positional", None, Path),
-    train_path=("Location of JSON-formatted training data", "positional", None, Path),
-    dev_path=("Location of JSON-formatted development data", "positional", None, Path),
-    raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
-    base_model=("Name of model to update (optional)", "option", "b", str),
-    pipeline=("Comma-separated names of pipeline components", "option", "p", str),
-    vectors=("Model to load vectors from", "option", "v", str),
-    n_iter=("Number of iterations", "option", "n", int),
-    n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
-    n_examples=("Number of examples", "option", "ns", int),
-    use_gpu=("Use GPU", "option", "g", int),
-    version=("Model version", "option", "V", str),
-    meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
-    init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
-    parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
-    entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
-    noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
-    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
-    eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
-    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
-    learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
-    textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
-    textcat_arch=("Textcat model architecture", "option", "ta", str),
-    textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
-    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
-    verbose=("Display more information for debug", "flag", "VV", bool),
-    debug=("Run data diagnostics before training", "flag", "D", bool),
-    # fmt: on
-)
 def train(
-    lang,
-    output_path,
-    train_path,
-    dev_path,
-    raw_text=None,
-    base_model=None,
-    pipeline="tagger,parser,ner",
-    vectors=None,
-    n_iter=30,
-    n_early_stopping=None,
-    n_examples=0,
-    use_gpu=-1,
-    version="0.0.0",
-    meta_path=None,
-    init_tok2vec=None,
-    parser_multitasks="",
-    entity_multitasks="",
-    noise_level=0.0,
-    orth_variant_level=0.0,
-    eval_beam_widths="",
-    gold_preproc=False,
-    learn_tokens=False,
-    textcat_multilabel=False,
-    textcat_arch="bow",
-    textcat_positive_label=None,
-    tag_map_path=None,
-    verbose=False,
-    debug=False,
+    # fmt: off
+    lang: ("Model language", "positional", None, str),
+    output_path: ("Output directory to store model in", "positional", None, Path),
+    train_path: ("Location of JSON-formatted training data", "positional", None, Path),
+    dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
+    raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
+    base_model: ("Name of model to update (optional)", "option", "b", str) = None,
+    pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
+    vectors: ("Model to load vectors from", "option", "v", str) = None,
+    n_iter: ("Number of iterations", "option", "n", int) = 30,
+    n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
+    n_examples: ("Number of examples", "option", "ns", int) = 0,
+    use_gpu: ("Use GPU", "option", "g", int) = -1,
+    version: ("Model version", "option", "V", str) = "0.0.0",
+    meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None,
+    init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
+    parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "",
+    entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "",
+    noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0,
+    orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0,
+    eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "",
+    gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
+    learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False,
+    textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False,
+    textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow",
+    textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None,
+    tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
+    verbose: ("Display more information for debug", "flag", "VV", bool) = False,
+    debug: ("Run data diagnostics before training", "flag", "D", bool) = False,
+    # fmt: on
 ):
     """
     Train or update a spaCy model. Requires data to be formatted in spaCy's