mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	Modernize plac commands for Python 3 (#4836)
This commit is contained in:
		
							parent
							
								
									401946d480
								
							
						
					
					
						commit
						83e0a6f3e3
					
				|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from wasabi import Printer | from wasabi import Printer | ||||||
| import srsly | import srsly | ||||||
|  | @ -26,31 +25,19 @@ FILE_TYPES = ("json", "jsonl", "msg") | ||||||
| FILE_TYPES_STDOUT = ("json", "jsonl") | FILE_TYPES_STDOUT = ("json", "jsonl") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( |  | ||||||
|     # fmt: off |  | ||||||
|     input_file=("Input file", "positional", None, str), |  | ||||||
|     output_dir=("Output directory. '-' for stdout.", "positional", None, str), |  | ||||||
|     file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES), |  | ||||||
|     n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int), |  | ||||||
|     seg_sents=("Segment sentences (for -c ner)", "flag", "s"), |  | ||||||
|     model=("Model for sentence segmentation (for -s)", "option", "b", str), |  | ||||||
|     converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str), |  | ||||||
|     lang=("Language (if tokenizer required)", "option", "l", str), |  | ||||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool), |  | ||||||
|     ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,), |  | ||||||
|     # fmt: on |  | ||||||
| ) |  | ||||||
| def convert( | def convert( | ||||||
|     input_file, |     # fmt: off | ||||||
|     output_dir="-", |     input_file: ("Input file", "positional", None, str), | ||||||
|     file_type="json", |     output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-", | ||||||
|     n_sents=1, |     file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", | ||||||
|     seg_sents=False, |     n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, | ||||||
|     model=None, |     seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, | ||||||
|     morphology=False, |     model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, | ||||||
|     converter="auto", |     morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, | ||||||
|     ner_map_path=None, |     converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", | ||||||
|     lang=None, |     ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, | ||||||
|  |     lang: ("Language (if tokenizer required)", "option", "l", str) = None, | ||||||
|  |     # fmt: on | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     Convert files into JSON format for use with train command and other |     Convert files into JSON format for use with train command and other | ||||||
|  |  | ||||||
|  | @ -1,6 +1,5 @@ | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from collections import Counter | from collections import Counter | ||||||
| import plac |  | ||||||
| import sys | import sys | ||||||
| import srsly | import srsly | ||||||
| from wasabi import Printer, MESSAGES | from wasabi import Printer, MESSAGES | ||||||
|  | @ -19,29 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100 | ||||||
| BLANK_MODEL_THRESHOLD = 2000 | BLANK_MODEL_THRESHOLD = 2000 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( |  | ||||||
|     # fmt: off |  | ||||||
|     lang=("model language", "positional", None, str), |  | ||||||
|     train_path=("location of JSON-formatted training data", "positional", None, Path), |  | ||||||
|     dev_path=("location of JSON-formatted development data", "positional", None, Path), |  | ||||||
|     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), |  | ||||||
|     base_model=("name of model to update (optional)", "option", "b", str), |  | ||||||
|     pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), |  | ||||||
|     ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), |  | ||||||
|     verbose=("Print additional information and explanations", "flag", "V", bool), |  | ||||||
|     no_format=("Don't pretty-print the results", "flag", "NF", bool), |  | ||||||
|     # fmt: on |  | ||||||
| ) |  | ||||||
| def debug_data( | def debug_data( | ||||||
|     lang, |     # fmt: off | ||||||
|     train_path, |     lang: ("Model language", "positional", None, str), | ||||||
|     dev_path, |     train_path: ("Location of JSON-formatted training data", "positional", None, Path), | ||||||
|     tag_map_path=None, |     dev_path: ("Location of JSON-formatted development data", "positional", None, Path), | ||||||
|     base_model=None, |     tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, | ||||||
|     pipeline="tagger,parser,ner", |     base_model: ("Name of model to update (optional)", "option", "b", str) = None, | ||||||
|     ignore_warnings=False, |     pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner", | ||||||
|     verbose=False, |     ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False, | ||||||
|     no_format=False, |     verbose: ("Print additional information and explanations", "flag", "V", bool) = False, | ||||||
|  |     no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False, | ||||||
|  |     # fmt: on | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     Analyze, debug and validate your training and development data, get useful |     Analyze, debug and validate your training and development data, get useful | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| import requests | import requests | ||||||
| import os | import os | ||||||
| import subprocess | import subprocess | ||||||
|  | @ -10,12 +9,11 @@ from ..util import get_package_path | ||||||
| from .. import about | from .. import about | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( | def download( | ||||||
|     model=("Model to download (shortcut or name)", "positional", None, str), |     model: ("Model to download (shortcut or name)", "positional", None, str), | ||||||
|     direct=("Force direct download of name + version", "flag", "d", bool), |     direct: ("Force direct download of name + version", "flag", "d", bool) = False, | ||||||
|     pip_args=("Additional arguments to be passed to `pip install` on model install"), |     *pip_args: ("Additional arguments to be passed to `pip install` on model install"), | ||||||
| ) | ): | ||||||
| def download(model, direct=False, *pip_args): |  | ||||||
|     """ |     """ | ||||||
|     Download compatible model from default download path using pip. Model |     Download compatible model from default download path using pip. Model | ||||||
|     can be shortcut, model name or, if --direct flag is set, full model name |     can be shortcut, model name or, if --direct flag is set, full model name | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||||
| from wasabi import msg | from wasabi import msg | ||||||
| 
 | 
 | ||||||
|  | @ -7,23 +6,16 @@ from .. import util | ||||||
| from .. import displacy | from .. import displacy | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( |  | ||||||
|     model=("Model name or path", "positional", None, str), |  | ||||||
|     data_path=("Location of JSON-formatted evaluation data", "positional", None, str), |  | ||||||
|     gold_preproc=("Use gold preprocessing", "flag", "G", bool), |  | ||||||
|     gpu_id=("Use GPU", "option", "g", int), |  | ||||||
|     displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), |  | ||||||
|     displacy_limit=("Limit of parses to render as HTML", "option", "dl", int), |  | ||||||
|     return_scores=("Return dict containing model scores", "flag", "R", bool), |  | ||||||
| ) |  | ||||||
| def evaluate( | def evaluate( | ||||||
|     model, |     # fmt: off | ||||||
|     data_path, |     model: ("Model name or path", "positional", None, str), | ||||||
|     gpu_id=-1, |     data_path: ("Location of JSON-formatted evaluation data", "positional", None, str), | ||||||
|     gold_preproc=False, |     gpu_id: ("Use GPU", "option", "g", int) = -1, | ||||||
|     displacy_path=None, |     gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, | ||||||
|     displacy_limit=25, |     displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None, | ||||||
|     return_scores=False, |     displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25, | ||||||
|  |     return_scores: ("Return dict containing model scores", "flag", "R", bool) = False, | ||||||
|  |     # fmt: on | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     Evaluate a model. To render a sample of parses in a HTML file, set an |     Evaluate a model. To render a sample of parses in a HTML file, set an | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| import platform | import platform | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from wasabi import msg | from wasabi import msg | ||||||
|  | @ -8,12 +7,11 @@ from .. import util | ||||||
| from .. import about | from .. import about | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( | def info( | ||||||
|     model=("Optional shortcut link of model", "positional", None, str), |     model: ("Optional shortcut link of model", "positional", None, str) = None, | ||||||
|     markdown=("Generate Markdown for GitHub issues", "flag", "md", str), |     markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False, | ||||||
|     silent=("Don't print anything (just return)", "flag", "s"), |     silent: ("Don't print anything (just return)", "flag", "s") = False, | ||||||
| ) | ): | ||||||
| def info(model=None, markdown=False, silent=False): |  | ||||||
|     """ |     """ | ||||||
|     Print info about spaCy installation. If a model shortcut link is |     Print info about spaCy installation. If a model shortcut link is | ||||||
|     speficied as an argument, print model information. Flag --markdown |     speficied as an argument, print model information. Flag --markdown | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| import math | import math | ||||||
| from tqdm import tqdm | from tqdm import tqdm | ||||||
| import numpy | import numpy | ||||||
|  | @ -24,32 +23,18 @@ except ImportError: | ||||||
| DEFAULT_OOV_PROB = -20 | DEFAULT_OOV_PROB = -20 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( |  | ||||||
|     lang=("Model language", "positional", None, str), |  | ||||||
|     output_dir=("Model output directory", "positional", None, Path), |  | ||||||
|     freqs_loc=("Location of words frequencies file", "option", "f", Path), |  | ||||||
|     jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path), |  | ||||||
|     clusters_loc=("Optional location of brown clusters data", "option", "c", str), |  | ||||||
|     vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), |  | ||||||
|     prune_vectors=("Optional number of vectors to prune to", "option", "V", int), |  | ||||||
|     vectors_name=( |  | ||||||
|         "Optional name for the word vectors, e.g. en_core_web_lg.vectors", |  | ||||||
|         "option", |  | ||||||
|         "vn", |  | ||||||
|         str, |  | ||||||
|     ), |  | ||||||
|     model_name=("Optional name for the model meta", "option", "mn", str), |  | ||||||
| ) |  | ||||||
| def init_model( | def init_model( | ||||||
|     lang, |     # fmt: off | ||||||
|     output_dir, |     lang: ("Model language", "positional", None, str), | ||||||
|     freqs_loc=None, |     output_dir: ("Model output directory", "positional", None, Path), | ||||||
|     clusters_loc=None, |     freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None, | ||||||
|     jsonl_loc=None, |     clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None, | ||||||
|     vectors_loc=None, |     jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None, | ||||||
|     prune_vectors=-1, |     vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None, | ||||||
|     vectors_name=None, |     prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1, | ||||||
|     model_name=None, |     vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None, | ||||||
|  |     model_name: ("Optional name for the model meta", "option", "mn", str) = None, | ||||||
|  |     # fmt: on | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     Create a new model from raw data, like word frequencies, Brown clusters |     Create a new model from raw data, like word frequencies, Brown clusters | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from wasabi import msg | from wasabi import msg | ||||||
| 
 | 
 | ||||||
|  | @ -6,12 +5,12 @@ from ..compat import symlink_to | ||||||
| from .. import util | from .. import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( | def link( | ||||||
|     origin=("package name or local path to model", "positional", None, str), |     origin: ("package name or local path to model", "positional", None, str), | ||||||
|     link_name=("name of shortuct link to create", "positional", None, str), |     link_name: ("name of shortuct link to create", "positional", None, str), | ||||||
|     force=("force overwriting of existing link", "flag", "f", bool), |     force: ("force overwriting of existing link", "flag", "f", bool) = False, | ||||||
| ) |     model_path=None, | ||||||
| def link(origin, link_name, force=False, model_path=None): | ): | ||||||
|     """ |     """ | ||||||
|     Create a symlink for models within the spacy/data directory. Accepts |     Create a symlink for models within the spacy/data directory. Accepts | ||||||
|     either the name of a pip package, or the local path to the model data |     either the name of a pip package, or the local path to the model data | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| import shutil | import shutil | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from wasabi import msg, get_raw_input | from wasabi import msg, get_raw_input | ||||||
|  | @ -8,14 +7,15 @@ from .. import util | ||||||
| from .. import about | from .. import about | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( | def package( | ||||||
|     input_dir=("Directory with model data", "positional", None, str), |     # fmt: off | ||||||
|     output_dir=("Output parent directory", "positional", None, str), |     input_dir: ("Directory with model data", "positional", None, str), | ||||||
|     meta_path=("Path to meta.json", "option", "m", str), |     output_dir: ("Output parent directory", "positional", None, str), | ||||||
|     create_meta=("Create meta.json, even if one exists", "flag", "c", bool), |     meta_path: ("Path to meta.json", "option", "m", str) = None, | ||||||
|     force=("Force overwriting existing model in output directory", "flag", "f", bool), |     create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False, | ||||||
| ) |     force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False, | ||||||
| def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False): |     # fmt: on | ||||||
|  | ): | ||||||
|     """ |     """ | ||||||
|     Generate Python package for model data, including meta and required |     Generate Python package for model data, including meta and required | ||||||
|     installation files. A new directory will be created in the specified |     installation files. A new directory will be created in the specified | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| import random | import random | ||||||
| import numpy | import numpy | ||||||
| import time | import time | ||||||
|  | @ -21,85 +20,31 @@ from .. import util | ||||||
| from .train import _load_pretrained_tok2vec | from .train import _load_pretrained_tok2vec | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( |  | ||||||
|     texts_loc=( |  | ||||||
|         "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the " |  | ||||||
|         "key 'tokens'", |  | ||||||
|         "positional", |  | ||||||
|         None, |  | ||||||
|         str, |  | ||||||
|     ), |  | ||||||
|     vectors_model=("Name or path to spaCy model with vectors to learn from"), |  | ||||||
|     output_dir=("Directory to write models to on each epoch", "positional", None, str), |  | ||||||
|     width=("Width of CNN layers", "option", "cw", int), |  | ||||||
|     depth=("Depth of CNN layers", "option", "cd", int), |  | ||||||
|     cnn_window=("Window size for CNN layers", "option", "cW", int), |  | ||||||
|     cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int), |  | ||||||
|     use_chars=("Whether to use character-based embedding", "flag", "chr", bool), |  | ||||||
|     sa_depth=("Depth of self-attention layers", "option", "sa", int), |  | ||||||
|     bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int), |  | ||||||
|     embed_rows=("Number of embedding rows", "option", "er", int), |  | ||||||
|     loss_func=( |  | ||||||
|         "Loss function to use for the objective. Either 'L2' or 'cosine'", |  | ||||||
|         "option", |  | ||||||
|         "L", |  | ||||||
|         str, |  | ||||||
|     ), |  | ||||||
|     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"), |  | ||||||
|     dropout=("Dropout rate", "option", "d", float), |  | ||||||
|     batch_size=("Number of words per training batch", "option", "bs", int), |  | ||||||
|     max_length=( |  | ||||||
|         "Max words per example. Longer examples are discarded", |  | ||||||
|         "option", |  | ||||||
|         "xw", |  | ||||||
|         int, |  | ||||||
|     ), |  | ||||||
|     min_length=( |  | ||||||
|         "Min words per example. Shorter examples are discarded", |  | ||||||
|         "option", |  | ||||||
|         "nw", |  | ||||||
|         int, |  | ||||||
|     ), |  | ||||||
|     seed=("Seed for random number generators", "option", "s", int), |  | ||||||
|     n_iter=("Number of iterations to pretrain", "option", "i", int), |  | ||||||
|     n_save_every=("Save model every X batches.", "option", "se", int), |  | ||||||
|     init_tok2vec=( |  | ||||||
|         "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", |  | ||||||
|         "option", |  | ||||||
|         "t2v", |  | ||||||
|         Path, |  | ||||||
|     ), |  | ||||||
|     epoch_start=( |  | ||||||
|         "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been " |  | ||||||
|         "renamed. Prevents unintended overwriting of existing weight files.", |  | ||||||
|         "option", |  | ||||||
|         "es", |  | ||||||
|         int, |  | ||||||
|     ), |  | ||||||
| ) |  | ||||||
| def pretrain( | def pretrain( | ||||||
|     texts_loc, |     # fmt: off | ||||||
|     vectors_model, |     texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), | ||||||
|     output_dir, |     vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str), | ||||||
|     width=96, |     output_dir: ("Directory to write models to on each epoch", "positional", None, str), | ||||||
|     depth=4, |     width: ("Width of CNN layers", "option", "cw", int) = 96, | ||||||
|     bilstm_depth=0, |     depth: ("Depth of CNN layers", "option", "cd", int) = 4, | ||||||
|     cnn_pieces=3, |     bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0, | ||||||
|     sa_depth=0, |     cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3, | ||||||
|     use_chars=False, |     sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0, | ||||||
|     cnn_window=1, |     use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False, | ||||||
|     embed_rows=2000, |     cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1, | ||||||
|     loss_func="cosine", |     embed_rows: ("Number of embedding rows", "option", "er", int) = 2000, | ||||||
|     use_vectors=False, |     loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine", | ||||||
|     dropout=0.2, |     use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False, | ||||||
|     n_iter=1000, |     dropout: ("Dropout rate", "option", "d", float) = 0.2, | ||||||
|     batch_size=3000, |     n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000, | ||||||
|     max_length=500, |     batch_size: ("Number of words per training batch", "option", "bs", int) = 3000, | ||||||
|     min_length=5, |     max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500, | ||||||
|     seed=0, |     min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5, | ||||||
|     n_save_every=None, |     seed: ("Seed for random number generators", "option", "s", int) = 0, | ||||||
|     init_tok2vec=None, |     n_save_every: ("Save model every X batches.", "option", "se", int) = None, | ||||||
|     epoch_start=None, |     init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, | ||||||
|  |     epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None, | ||||||
|  |     # fmt: on | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, |     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| import tqdm | import tqdm | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import srsly | import srsly | ||||||
|  | @ -12,12 +11,13 @@ from wasabi import msg | ||||||
| from ..util import load_model | from ..util import load_model | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( | def profile( | ||||||
|     model=("Model to load", "positional", None, str), |     # fmt: off | ||||||
|     inputs=("Location of input file. '-' for stdin.", "positional", None, str), |     model: ("Model to load", "positional", None, str), | ||||||
|     n_texts=("Maximum number of texts to use if available", "option", "n", int), |     inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None, | ||||||
| ) |     n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000, | ||||||
| def profile(model, inputs=None, n_texts=10000): |     # fmt: on | ||||||
|  | ): | ||||||
|     """ |     """ | ||||||
|     Profile a spaCy pipeline, to find out which functions take the most time. |     Profile a spaCy pipeline, to find out which functions take the most time. | ||||||
|     Input should be formatted as one JSON object per line with a key "text". |     Input should be formatted as one JSON object per line with a key "text". | ||||||
|  |  | ||||||
|  | @ -1,4 +1,3 @@ | ||||||
| import plac |  | ||||||
| import os | import os | ||||||
| import tqdm | import tqdm | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | @ -17,67 +16,37 @@ from .. import util | ||||||
| from .. import about | from .. import about | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( |  | ||||||
|     # fmt: off |  | ||||||
|     lang=("Model language", "positional", None, str), |  | ||||||
|     output_path=("Output directory to store model in", "positional", None, Path), |  | ||||||
|     train_path=("Location of JSON-formatted training data", "positional", None, Path), |  | ||||||
|     dev_path=("Location of JSON-formatted development data", "positional", None, Path), |  | ||||||
|     raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), |  | ||||||
|     base_model=("Name of model to update (optional)", "option", "b", str), |  | ||||||
|     pipeline=("Comma-separated names of pipeline components", "option", "p", str), |  | ||||||
|     vectors=("Model to load vectors from", "option", "v", str), |  | ||||||
|     n_iter=("Number of iterations", "option", "n", int), |  | ||||||
|     n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int), |  | ||||||
|     n_examples=("Number of examples", "option", "ns", int), |  | ||||||
|     use_gpu=("Use GPU", "option", "g", int), |  | ||||||
|     version=("Model version", "option", "V", str), |  | ||||||
|     meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), |  | ||||||
|     init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path), |  | ||||||
|     parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str), |  | ||||||
|     entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str), |  | ||||||
|     noise_level=("Amount of corruption for data augmentation", "option", "nl", float), |  | ||||||
|     orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float), |  | ||||||
|     eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str), |  | ||||||
|     gold_preproc=("Use gold preprocessing", "flag", "G", bool), |  | ||||||
|     learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), |  | ||||||
|     textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool), |  | ||||||
|     textcat_arch=("Textcat model architecture", "option", "ta", str), |  | ||||||
|     textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), |  | ||||||
|     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), |  | ||||||
|     verbose=("Display more information for debug", "flag", "VV", bool), |  | ||||||
|     debug=("Run data diagnostics before training", "flag", "D", bool), |  | ||||||
|     # fmt: on |  | ||||||
| ) |  | ||||||
| def train( | def train( | ||||||
|     lang, |     # fmt: off | ||||||
|     output_path, |     lang: ("Model language", "positional", None, str), | ||||||
|     train_path, |     output_path: ("Output directory to store model in", "positional", None, Path), | ||||||
|     dev_path, |     train_path: ("Location of JSON-formatted training data", "positional", None, Path), | ||||||
|     raw_text=None, |     dev_path: ("Location of JSON-formatted development data", "positional", None, Path), | ||||||
|     base_model=None, |     raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, | ||||||
|     pipeline="tagger,parser,ner", |     base_model: ("Name of model to update (optional)", "option", "b", str) = None, | ||||||
|     vectors=None, |     pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner", | ||||||
|     n_iter=30, |     vectors: ("Model to load vectors from", "option", "v", str) = None, | ||||||
|     n_early_stopping=None, |     n_iter: ("Number of iterations", "option", "n", int) = 30, | ||||||
|     n_examples=0, |     n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None, | ||||||
|     use_gpu=-1, |     n_examples: ("Number of examples", "option", "ns", int) = 0, | ||||||
|     version="0.0.0", |     use_gpu: ("Use GPU", "option", "g", int) = -1, | ||||||
|     meta_path=None, |     version: ("Model version", "option", "V", str) = "0.0.0", | ||||||
|     init_tok2vec=None, |     meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None, | ||||||
|     parser_multitasks="", |     init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, | ||||||
|     entity_multitasks="", |     parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "", | ||||||
|     noise_level=0.0, |     entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "", | ||||||
|     orth_variant_level=0.0, |     noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0, | ||||||
|     eval_beam_widths="", |     orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0, | ||||||
|     gold_preproc=False, |     eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "", | ||||||
|     learn_tokens=False, |     gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, | ||||||
|     textcat_multilabel=False, |     learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False, | ||||||
|     textcat_arch="bow", |     textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False, | ||||||
|     textcat_positive_label=None, |     textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow", | ||||||
|     tag_map_path=None, |     textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None, | ||||||
|     verbose=False, |     tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, | ||||||
|     debug=False, |     verbose: ("Display more information for debug", "flag", "VV", bool) = False, | ||||||
|  |     debug: ("Run data diagnostics before training", "flag", "D", bool) = False, | ||||||
|  |     # fmt: on | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|     Train or update a spaCy model. Requires data to be formatted in spaCy's |     Train or update a spaCy model. Requires data to be formatted in spaCy's | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user