mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
Modernize plac commands for Python 3 (#4836)
This commit is contained in:
parent
401946d480
commit
83e0a6f3e3
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -26,31 +25,19 @@ FILE_TYPES = ("json", "jsonl", "msg")
|
||||||
FILE_TYPES_STDOUT = ("json", "jsonl")
|
FILE_TYPES_STDOUT = ("json", "jsonl")
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
# fmt: off
|
|
||||||
input_file=("Input file", "positional", None, str),
|
|
||||||
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
|
|
||||||
file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES),
|
|
||||||
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
|
|
||||||
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
|
|
||||||
model=("Model for sentence segmentation (for -s)", "option", "b", str),
|
|
||||||
converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str),
|
|
||||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
|
||||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
|
||||||
ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,),
|
|
||||||
# fmt: on
|
|
||||||
)
|
|
||||||
def convert(
|
def convert(
|
||||||
input_file,
|
# fmt: off
|
||||||
output_dir="-",
|
input_file: ("Input file", "positional", None, str),
|
||||||
file_type="json",
|
output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
|
||||||
n_sents=1,
|
file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
|
||||||
seg_sents=False,
|
n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
|
||||||
model=None,
|
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
|
||||||
morphology=False,
|
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
|
||||||
converter="auto",
|
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
|
||||||
ner_map_path=None,
|
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
|
||||||
lang=None,
|
ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
|
||||||
|
lang: ("Language (if tokenizer required)", "option", "l", str) = None,
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Convert files into JSON format for use with train command and other
|
Convert files into JSON format for use with train command and other
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import plac
|
|
||||||
import sys
|
import sys
|
||||||
import srsly
|
import srsly
|
||||||
from wasabi import Printer, MESSAGES
|
from wasabi import Printer, MESSAGES
|
||||||
|
@ -19,29 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
|
||||||
BLANK_MODEL_THRESHOLD = 2000
|
BLANK_MODEL_THRESHOLD = 2000
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
# fmt: off
|
|
||||||
lang=("model language", "positional", None, str),
|
|
||||||
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
|
||||||
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
|
||||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
|
||||||
base_model=("name of model to update (optional)", "option", "b", str),
|
|
||||||
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
|
|
||||||
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
|
||||||
verbose=("Print additional information and explanations", "flag", "V", bool),
|
|
||||||
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
|
||||||
# fmt: on
|
|
||||||
)
|
|
||||||
def debug_data(
|
def debug_data(
|
||||||
lang,
|
# fmt: off
|
||||||
train_path,
|
lang: ("Model language", "positional", None, str),
|
||||||
dev_path,
|
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
|
||||||
tag_map_path=None,
|
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
|
||||||
base_model=None,
|
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
|
||||||
pipeline="tagger,parser,ner",
|
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
|
||||||
ignore_warnings=False,
|
pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
|
||||||
verbose=False,
|
ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
|
||||||
no_format=False,
|
verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
|
||||||
|
no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Analyze, debug and validate your training and development data, get useful
|
Analyze, debug and validate your training and development data, get useful
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
@ -10,12 +9,11 @@ from ..util import get_package_path
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
def download(
|
||||||
model=("Model to download (shortcut or name)", "positional", None, str),
|
model: ("Model to download (shortcut or name)", "positional", None, str),
|
||||||
direct=("Force direct download of name + version", "flag", "d", bool),
|
direct: ("Force direct download of name + version", "flag", "d", bool) = False,
|
||||||
pip_args=("Additional arguments to be passed to `pip install` on model install"),
|
*pip_args: ("Additional arguments to be passed to `pip install` on model install"),
|
||||||
)
|
):
|
||||||
def download(model, direct=False, *pip_args):
|
|
||||||
"""
|
"""
|
||||||
Download compatible model from default download path using pip. Model
|
Download compatible model from default download path using pip. Model
|
||||||
can be shortcut, model name or, if --direct flag is set, full model name
|
can be shortcut, model name or, if --direct flag is set, full model name
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
|
@ -7,23 +6,16 @@ from .. import util
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
model=("Model name or path", "positional", None, str),
|
|
||||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
|
||||||
gpu_id=("Use GPU", "option", "g", int),
|
|
||||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
|
||||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
|
|
||||||
return_scores=("Return dict containing model scores", "flag", "R", bool),
|
|
||||||
)
|
|
||||||
def evaluate(
|
def evaluate(
|
||||||
model,
|
# fmt: off
|
||||||
data_path,
|
model: ("Model name or path", "positional", None, str),
|
||||||
gpu_id=-1,
|
data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||||
gold_preproc=False,
|
gpu_id: ("Use GPU", "option", "g", int) = -1,
|
||||||
displacy_path=None,
|
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
|
||||||
displacy_limit=25,
|
displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
|
||||||
return_scores=False,
|
displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
|
||||||
|
return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
@ -8,12 +7,11 @@ from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
def info(
|
||||||
model=("Optional shortcut link of model", "positional", None, str),
|
model: ("Optional shortcut link of model", "positional", None, str) = None,
|
||||||
markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
|
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
|
||||||
silent=("Don't print anything (just return)", "flag", "s"),
|
silent: ("Don't print anything (just return)", "flag", "s") = False,
|
||||||
)
|
):
|
||||||
def info(model=None, markdown=False, silent=False):
|
|
||||||
"""
|
"""
|
||||||
Print info about spaCy installation. If a model shortcut link is
|
Print info about spaCy installation. If a model shortcut link is
|
||||||
specified as an argument, print model information. Flag --markdown
|
specified as an argument, print model information. Flag --markdown
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
import math
|
import math
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -24,32 +23,18 @@ except ImportError:
|
||||||
DEFAULT_OOV_PROB = -20
|
DEFAULT_OOV_PROB = -20
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
lang=("Model language", "positional", None, str),
|
|
||||||
output_dir=("Model output directory", "positional", None, Path),
|
|
||||||
freqs_loc=("Location of words frequencies file", "option", "f", Path),
|
|
||||||
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
|
||||||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
|
||||||
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
|
||||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
|
||||||
vectors_name=(
|
|
||||||
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
|
|
||||||
"option",
|
|
||||||
"vn",
|
|
||||||
str,
|
|
||||||
),
|
|
||||||
model_name=("Optional name for the model meta", "option", "mn", str),
|
|
||||||
)
|
|
||||||
def init_model(
|
def init_model(
|
||||||
lang,
|
# fmt: off
|
||||||
output_dir,
|
lang: ("Model language", "positional", None, str),
|
||||||
freqs_loc=None,
|
output_dir: ("Model output directory", "positional", None, Path),
|
||||||
clusters_loc=None,
|
freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
|
||||||
jsonl_loc=None,
|
clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
|
||||||
vectors_loc=None,
|
jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
|
||||||
prune_vectors=-1,
|
vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
|
||||||
vectors_name=None,
|
prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
|
||||||
model_name=None,
|
vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
|
||||||
|
model_name: ("Optional name for the model meta", "option", "mn", str) = None,
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create a new model from raw data, like word frequencies, Brown clusters
|
Create a new model from raw data, like word frequencies, Brown clusters
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
|
@ -6,12 +5,12 @@ from ..compat import symlink_to
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
def link(
|
||||||
origin=("package name or local path to model", "positional", None, str),
|
origin: ("package name or local path to model", "positional", None, str),
|
||||||
link_name=("name of shortuct link to create", "positional", None, str),
|
link_name: ("name of shortuct link to create", "positional", None, str),
|
||||||
force=("force overwriting of existing link", "flag", "f", bool),
|
force: ("force overwriting of existing link", "flag", "f", bool) = False,
|
||||||
)
|
model_path=None,
|
||||||
def link(origin, link_name, force=False, model_path=None):
|
):
|
||||||
"""
|
"""
|
||||||
Create a symlink for models within the spacy/data directory. Accepts
|
Create a symlink for models within the spacy/data directory. Accepts
|
||||||
either the name of a pip package, or the local path to the model data
|
either the name of a pip package, or the local path to the model data
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg, get_raw_input
|
from wasabi import msg, get_raw_input
|
||||||
|
@ -8,14 +7,15 @@ from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
def package(
|
||||||
input_dir=("Directory with model data", "positional", None, str),
|
# fmt: off
|
||||||
output_dir=("Output parent directory", "positional", None, str),
|
input_dir: ("Directory with model data", "positional", None, str),
|
||||||
meta_path=("Path to meta.json", "option", "m", str),
|
output_dir: ("Output parent directory", "positional", None, str),
|
||||||
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
meta_path: ("Path to meta.json", "option", "m", str) = None,
|
||||||
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
|
||||||
)
|
force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
|
||||||
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
# fmt: on
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Generate Python package for model data, including meta and required
|
Generate Python package for model data, including meta and required
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
import random
|
import random
|
||||||
import numpy
|
import numpy
|
||||||
import time
|
import time
|
||||||
|
@ -21,85 +20,31 @@ from .. import util
|
||||||
from .train import _load_pretrained_tok2vec
|
from .train import _load_pretrained_tok2vec
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
texts_loc=(
|
|
||||||
"Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
|
|
||||||
"key 'tokens'",
|
|
||||||
"positional",
|
|
||||||
None,
|
|
||||||
str,
|
|
||||||
),
|
|
||||||
vectors_model=("Name or path to spaCy model with vectors to learn from"),
|
|
||||||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
|
||||||
width=("Width of CNN layers", "option", "cw", int),
|
|
||||||
depth=("Depth of CNN layers", "option", "cd", int),
|
|
||||||
cnn_window=("Window size for CNN layers", "option", "cW", int),
|
|
||||||
cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
|
|
||||||
use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
|
|
||||||
sa_depth=("Depth of self-attention layers", "option", "sa", int),
|
|
||||||
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
|
|
||||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
|
||||||
loss_func=(
|
|
||||||
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
|
||||||
"option",
|
|
||||||
"L",
|
|
||||||
str,
|
|
||||||
),
|
|
||||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
|
||||||
dropout=("Dropout rate", "option", "d", float),
|
|
||||||
batch_size=("Number of words per training batch", "option", "bs", int),
|
|
||||||
max_length=(
|
|
||||||
"Max words per example. Longer examples are discarded",
|
|
||||||
"option",
|
|
||||||
"xw",
|
|
||||||
int,
|
|
||||||
),
|
|
||||||
min_length=(
|
|
||||||
"Min words per example. Shorter examples are discarded",
|
|
||||||
"option",
|
|
||||||
"nw",
|
|
||||||
int,
|
|
||||||
),
|
|
||||||
seed=("Seed for random number generators", "option", "s", int),
|
|
||||||
n_iter=("Number of iterations to pretrain", "option", "i", int),
|
|
||||||
n_save_every=("Save model every X batches.", "option", "se", int),
|
|
||||||
init_tok2vec=(
|
|
||||||
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
|
||||||
"option",
|
|
||||||
"t2v",
|
|
||||||
Path,
|
|
||||||
),
|
|
||||||
epoch_start=(
|
|
||||||
"The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
|
|
||||||
"renamed. Prevents unintended overwriting of existing weight files.",
|
|
||||||
"option",
|
|
||||||
"es",
|
|
||||||
int,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
def pretrain(
|
def pretrain(
|
||||||
texts_loc,
|
# fmt: off
|
||||||
vectors_model,
|
texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
|
||||||
output_dir,
|
vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str),
|
||||||
width=96,
|
output_dir: ("Directory to write models to on each epoch", "positional", None, str),
|
||||||
depth=4,
|
width: ("Width of CNN layers", "option", "cw", int) = 96,
|
||||||
bilstm_depth=0,
|
depth: ("Depth of CNN layers", "option", "cd", int) = 4,
|
||||||
cnn_pieces=3,
|
bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0,
|
||||||
sa_depth=0,
|
cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3,
|
||||||
use_chars=False,
|
sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0,
|
||||||
cnn_window=1,
|
use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False,
|
||||||
embed_rows=2000,
|
cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1,
|
||||||
loss_func="cosine",
|
embed_rows: ("Number of embedding rows", "option", "er", int) = 2000,
|
||||||
use_vectors=False,
|
loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine",
|
||||||
dropout=0.2,
|
use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False,
|
||||||
n_iter=1000,
|
dropout: ("Dropout rate", "option", "d", float) = 0.2,
|
||||||
batch_size=3000,
|
n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000,
|
||||||
max_length=500,
|
batch_size: ("Number of words per training batch", "option", "bs", int) = 3000,
|
||||||
min_length=5,
|
max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500,
|
||||||
seed=0,
|
min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5,
|
||||||
n_save_every=None,
|
seed: ("Seed for random number generators", "option", "s", int) = 0,
|
||||||
init_tok2vec=None,
|
n_save_every: ("Save model every X batches.", "option", "se", int) = None,
|
||||||
epoch_start=None,
|
init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
|
||||||
|
epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None,
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -12,12 +11,13 @@ from wasabi import msg
|
||||||
from ..util import load_model
|
from ..util import load_model
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
def profile(
|
||||||
model=("Model to load", "positional", None, str),
|
# fmt: off
|
||||||
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
model: ("Model to load", "positional", None, str),
|
||||||
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
|
||||||
)
|
n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
|
||||||
def profile(model, inputs=None, n_texts=10000):
|
# fmt: on
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||||
Input should be formatted as one JSON object per line with a key "text".
|
Input should be formatted as one JSON object per line with a key "text".
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import plac
|
|
||||||
import os
|
import os
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -17,67 +16,37 @@ from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
# fmt: off
|
|
||||||
lang=("Model language", "positional", None, str),
|
|
||||||
output_path=("Output directory to store model in", "positional", None, Path),
|
|
||||||
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
|
||||||
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
|
||||||
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
|
|
||||||
base_model=("Name of model to update (optional)", "option", "b", str),
|
|
||||||
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
|
|
||||||
vectors=("Model to load vectors from", "option", "v", str),
|
|
||||||
n_iter=("Number of iterations", "option", "n", int),
|
|
||||||
n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
|
|
||||||
n_examples=("Number of examples", "option", "ns", int),
|
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
|
||||||
version=("Model version", "option", "V", str),
|
|
||||||
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
|
|
||||||
init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
|
|
||||||
parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
|
|
||||||
entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
|
|
||||||
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
|
|
||||||
orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
|
|
||||||
eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
|
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
|
||||||
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
|
|
||||||
textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
|
|
||||||
textcat_arch=("Textcat model architecture", "option", "ta", str),
|
|
||||||
textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
|
|
||||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
|
||||||
verbose=("Display more information for debug", "flag", "VV", bool),
|
|
||||||
debug=("Run data diagnostics before training", "flag", "D", bool),
|
|
||||||
# fmt: on
|
|
||||||
)
|
|
||||||
def train(
|
def train(
|
||||||
lang,
|
# fmt: off
|
||||||
output_path,
|
lang: ("Model language", "positional", None, str),
|
||||||
train_path,
|
output_path: ("Output directory to store model in", "positional", None, Path),
|
||||||
dev_path,
|
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
|
||||||
raw_text=None,
|
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
|
||||||
base_model=None,
|
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
|
||||||
pipeline="tagger,parser,ner",
|
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
|
||||||
vectors=None,
|
pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
|
||||||
n_iter=30,
|
vectors: ("Model to load vectors from", "option", "v", str) = None,
|
||||||
n_early_stopping=None,
|
n_iter: ("Number of iterations", "option", "n", int) = 30,
|
||||||
n_examples=0,
|
n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
|
||||||
use_gpu=-1,
|
n_examples: ("Number of examples", "option", "ns", int) = 0,
|
||||||
version="0.0.0",
|
use_gpu: ("Use GPU", "option", "g", int) = -1,
|
||||||
meta_path=None,
|
version: ("Model version", "option", "V", str) = "0.0.0",
|
||||||
init_tok2vec=None,
|
meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None,
|
||||||
parser_multitasks="",
|
init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
|
||||||
entity_multitasks="",
|
parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "",
|
||||||
noise_level=0.0,
|
entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "",
|
||||||
orth_variant_level=0.0,
|
noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0,
|
||||||
eval_beam_widths="",
|
orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0,
|
||||||
gold_preproc=False,
|
eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "",
|
||||||
learn_tokens=False,
|
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
|
||||||
textcat_multilabel=False,
|
learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False,
|
||||||
textcat_arch="bow",
|
textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False,
|
||||||
textcat_positive_label=None,
|
textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow",
|
||||||
tag_map_path=None,
|
textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None,
|
||||||
verbose=False,
|
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
|
||||||
debug=False,
|
verbose: ("Display more information for debug", "flag", "VV", bool) = False,
|
||||||
|
debug: ("Run data diagnostics before training", "flag", "D", bool) = False,
|
||||||
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||||
|
|
Loading…
Reference in New Issue
Block a user