	Reduce complexity in CLI
Remove now redundant model command and move plac annotations to cli files
This commit is contained in:
parent aae97f00e9
commit fc3ec733ea
				|  | @ -3,127 +3,21 @@ from __future__ import print_function | |||
| # NB! This breaks in plac on Python 2!! | ||||
| #from __future__ import unicode_literals | ||||
| 
 | ||||
| import plac | ||||
| from spacy.cli import download as cli_download | ||||
| from spacy.cli import link as cli_link | ||||
| from spacy.cli import info as cli_info | ||||
| from spacy.cli import package as cli_package | ||||
| from spacy.cli import train as cli_train | ||||
| from spacy.cli import model as cli_model | ||||
| from spacy.cli import convert as cli_convert | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     model=("model to download (shortcut or model name)", "positional", None, str), | ||||
|     direct=("force direct download. Needs model name with version and won't " | ||||
|             "perform compatibility check", "flag", "d", bool) | ||||
| ) | ||||
| def download(model, direct=False): | ||||
|     """ | ||||
|     Download compatible model from default download path using pip. Model | ||||
|     can be shortcut, model name or, if --direct flag is set, full model name | ||||
|     with version. | ||||
|     """ | ||||
|     cli_download(model, direct) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     origin=("package name or local path to model", "positional", None, str), | ||||
|     link_name=("name of shortuct link to create", "positional", None, str), | ||||
|     force=("force overwriting of existing link", "flag", "f", bool) | ||||
| ) | ||||
| def link(origin, link_name, force=False): | ||||
|     """ | ||||
|     Create a symlink for models within the spacy/data directory. Accepts | ||||
|     either the name of a pip package, or the local path to the model data | ||||
|     directory. Linking models allows loading them via spacy.load(link_name). | ||||
|     """ | ||||
|     cli_link(origin, link_name, force) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     model=("optional: shortcut link of model", "positional", None, str), | ||||
|     markdown=("generate Markdown for GitHub issues", "flag", "md", str) | ||||
| ) | ||||
| def info(model=None, markdown=False): | ||||
|     """ | ||||
|     Print info about spaCy installation. If a model shortcut link is | ||||
|     specified as an argument, print model information. Flag --markdown | ||||
|     prints details in Markdown for easy copy-pasting to GitHub issues. | ||||
|     """ | ||||
|     cli_info(model, markdown) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     input_dir=("directory with model data", "positional", None, str), | ||||
|     output_dir=("output parent directory", "positional", None, str), | ||||
|     meta=("path to meta.json", "option", "m", str), | ||||
|     force=("force overwriting of existing folder in output directory", "flag", "f", bool) | ||||
| ) | ||||
| def package(input_dir, output_dir, meta=None, force=False): | ||||
|     """ | ||||
|     Generate Python package for model data, including meta and required | ||||
|     installation files. A new directory will be created in the specified | ||||
|     output directory, and model data will be copied over. | ||||
|     """ | ||||
|     cli_package(input_dir, output_dir, meta, force) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     input_file=("input file", "positional", None, str), | ||||
|     output_dir=("output directory for converted file", "positional", None, str), | ||||
|     n_sents=("Number of sentences per doc", "option", "n", float), | ||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool) | ||||
| ) | ||||
| def convert(input_file, output_dir, n_sents=10, morphology=False): | ||||
|     """ | ||||
|     Convert files into JSON format for use with train command and other | ||||
|     experiment management functions. | ||||
|     """ | ||||
|     cli_convert(input_file, output_dir, n_sents, morphology) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     lang=("model language", "positional", None, str), | ||||
|     output_dir=("output directory to store model in", "positional", None, str), | ||||
|     train_data=("location of JSON-formatted training data", "positional", None, str), | ||||
|     dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), | ||||
|     n_iter=("number of iterations", "option", "n", int), | ||||
|     nsents=("number of sentences", "option", None, int), | ||||
|     use_gpu=("Use GPU", "flag", "g", bool), | ||||
|     no_tagger=("Don't train tagger", "flag", "T", bool), | ||||
|     no_parser=("Don't train parser", "flag", "P", bool), | ||||
|     no_entities=("Don't train NER", "flag", "N", bool) | ||||
| ) | ||||
| def train(lang, output_dir, train_data, dev_data=None, n_iter=15, | ||||
|           nsents=0, use_gpu=False, | ||||
|           no_tagger=False, no_parser=False, no_entities=False): | ||||
|     """ | ||||
|     Train a model. Expects data in spaCy's JSON format. | ||||
|     """ | ||||
|     nsents = nsents or None | ||||
|     cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents, | ||||
|               use_gpu, no_tagger, no_parser, no_entities) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     import plac | ||||
|     import sys | ||||
|     commands = { | ||||
|         'train': train, | ||||
|         'convert': convert, | ||||
|         'download': download, | ||||
|         'link': link, | ||||
|         'info': info, | ||||
|         'package': package, | ||||
|     } | ||||
|     from spacy.cli import download, link, info, package, train, convert | ||||
|     from spacy.util import prints | ||||
| 
 | ||||
|     commands = {'download': download, 'link': link, 'info': info, 'train': train, | ||||
|                 'convert': convert, 'package': package} | ||||
|     if len(sys.argv) == 1: | ||||
|         print("Available commands: %s" % ', '.join(sorted(commands))) | ||||
|         sys.exit(1) | ||||
|         prints(', '.join(commands), title="Available commands", exits=1) | ||||
|     command = sys.argv.pop(1) | ||||
|     sys.argv[0] = 'spacy %s' % command | ||||
|     if command in commands: | ||||
|         plac.call(commands[command]) | ||||
|     else: | ||||
|         print("Unknown command: %s. Available: %s" % (command, ', '.join(commands))) | ||||
|         sys.exit(1) | ||||
|         prints("Available: %s" % ', '.join(commands), | ||||
|                title="Unknown command: %s" % command, exits=1) | ||||
|  |  | |||
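The pattern above — plac annotations attached directly to each command function, with the entry point reduced to a thin dispatcher around plac.call — can be sketched in isolation like this. The greet command and its arguments are made up purely for illustration; only the annotation/dispatch shape mirrors the diff:

    import sys
    import plac


    @plac.annotations(
        # plac annotation tuples: (help text, kind, abbreviation, type)
        name=("name to greet", "positional", None, str),
        loud=("print in upper case", "flag", "l", bool))
    def greet(name, loud=False):
        """Toy command, annotated in place the way the spaCy CLI commands now are."""
        message = "hello %s" % name
        print(message.upper() if loud else message)


    if __name__ == '__main__':
        commands = {'greet': greet}       # subcommand name -> annotated function
        command = sys.argv.pop(1)         # consume the subcommand, e.g. "greet"
        if command in commands:
            plac.call(commands[command])  # plac parses the remaining sys.argv
        else:
            sys.exit("Unknown command: %s" % command)

Saved as, say, example.py, this would be run as python example.py greet World -l.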
|  | @ -3,5 +3,4 @@ from .info import info | |||
| from .link import link | ||||
| from .package import package | ||||
| from .train import train | ||||
| from .model import model | ||||
| from .convert import convert | ||||
|  |  | |||
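A second change repeated through the file diffs below is prints(..., exits=True) becoming prints(..., exits=1): the exits keyword of spacy.util.prints now carries the exit status code rather than a boolean. Purely as a hypothetical stand-in for that kind of helper (not the actual spaCy implementation):

    from __future__ import print_function

    import sys


    def prints(*texts, **kwargs):
        # Hypothetical prints-style helper: show an optional title, then each
        # text block, then exit with the given status code if one is passed.
        title = kwargs.get('title')
        exits = kwargs.get('exits')
        if title is not None:
            print(title)
        for text in texts:
            print(text)
        if exits is not None:
            sys.exit(exits)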
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import plac | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from .converters import conllu2json, iob2json | ||||
|  | @ -18,15 +19,24 @@ CONVERTERS = { | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| def convert(input_file, output_dir, *args): | ||||
| @plac.annotations( | ||||
|     input_file=("input file", "positional", None, str), | ||||
|     output_dir=("output directory for converted file", "positional", None, str), | ||||
|     n_sents=("Number of sentences per doc", "option", "n", float), | ||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool) | ||||
| ) | ||||
| def convert(input_file, output_dir, n_sents, morphology): | ||||
|     """Convert files into JSON format for use with train command and other | ||||
|     experiment management functions. | ||||
|     """ | ||||
|     input_path = Path(input_file) | ||||
|     output_path = Path(output_dir) | ||||
|     if not input_path.exists(): | ||||
|         prints(input_path, title="Input file not found", exits=True) | ||||
|         prints(input_path, title="Input file not found", exits=1) | ||||
|     if not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=True) | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|     file_ext = input_path.suffix | ||||
|     if not file_ext in CONVERTERS: | ||||
|         prints("Can't find converter for %s" % input_path.parts[-1], | ||||
|                title="Unknown format", exits=True) | ||||
|                title="Unknown format", exits=1) | ||||
|     CONVERTERS[file_ext](input_path, output_path, n_sents, morphology) | ||||
|  |  | |||
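convert dispatches on the input file's suffix through the CONVERTERS mapping. A minimal standalone sketch of that dispatch-by-extension pattern, with a dummy converter standing in for the real conllu2json/iob2json functions:

    from pathlib import Path


    def dummy_conllu2json(input_path, output_path, n_sents, morphology):
        # Placeholder; the real converters live in spacy.cli.converters.
        print("converting %s -> %s (%s sents/doc)" % (input_path, output_path, n_sents))


    CONVERTERS = {'.conllu': dummy_conllu2json}  # file suffix -> converter


    def convert(input_file, output_dir, n_sents=10, morphology=False):
        input_path, output_path = Path(input_file), Path(output_dir)
        file_ext = input_path.suffix
        if file_ext not in CONVERTERS:
            raise ValueError("Unknown format: %s" % file_ext)
        CONVERTERS[file_ext](input_path, output_path, n_sents, morphology)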
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import plac | ||||
| import requests | ||||
| import os | ||||
| import subprocess | ||||
|  | @ -11,7 +12,16 @@ from ..util import prints | |||
| from .. import about | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     model=("model to download (shortcut or model name)", "positional", None, str), | ||||
|     direct=("force direct download. Needs model name with version and won't " | ||||
|             "perform compatibility check", "flag", "d", bool) | ||||
| ) | ||||
| def download(model, direct=False): | ||||
|     """Download compatible model from default download path using pip. Model | ||||
|     can be shortcut, model name or, if --direct flag is set, full model name | ||||
|     with version. | ||||
|     """ | ||||
|     if direct: | ||||
|         download_model('{m}/{m}.tar.gz'.format(m=model)) | ||||
|     else: | ||||
|  | @ -38,7 +48,7 @@ def get_json(url, desc): | |||
|     if r.status_code != 200: | ||||
|         prints("Couldn't fetch %s. Please find a model for your spaCy installation " | ||||
|                "(v%s), and download it manually." % (desc, about.__version__), | ||||
|                about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True) | ||||
|                about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1) | ||||
|     return r.json() | ||||
| 
 | ||||
| 
 | ||||
|  | @ -48,7 +58,7 @@ def get_compatibility(): | |||
|     comp = comp_table['spacy'] | ||||
|     if version not in comp: | ||||
|         prints("No compatible models found for v%s of spaCy." % version, | ||||
|                title="Compatibility error", exits=True) | ||||
|                title="Compatibility error", exits=1) | ||||
|     return comp[version] | ||||
| 
 | ||||
| 
 | ||||
|  | @ -56,7 +66,7 @@ def get_version(model, comp): | |||
|     if model not in comp: | ||||
|         version = about.__version__ | ||||
|         prints("No compatible model found for '%s' (spaCy v%s)." % (model, version), | ||||
|                title="Compatibility error", exits=True) | ||||
|                title="Compatibility error", exits=1) | ||||
|     return comp[model][0] | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
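download resolves the model version from a compatibility table keyed first by spaCy version and then by model name, taking the first listed entry. A sketch of that lookup; the table contents below are invented for illustration:

    SPACY_VERSION = '2.0.0'  # stand-in for about.__version__

    # Shape of the compatibility data the download command consumes;
    # these entries are made up.
    COMPATIBILITY = {
        '2.0.0': {'en_core_web_sm': ['2.0.0', '1.2.0']},
    }


    def get_version(model, version=SPACY_VERSION):
        if version not in COMPATIBILITY:
            raise ValueError("No compatible models found for spaCy v%s" % version)
        comp = COMPATIBILITY[version]
        if model not in comp:
            raise ValueError("No compatible model found for '%s'" % model)
        return comp[model][0]  # take the first listed version, as the command does


    print(get_version('en_core_web_sm'))  # -> '2.0.0'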
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import plac | ||||
| import platform | ||||
| from pathlib import Path | ||||
| 
 | ||||
|  | @ -9,7 +10,15 @@ from .. import about | |||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     model=("optional: shortcut link of model", "positional", None, str), | ||||
|     markdown=("generate Markdown for GitHub issues", "flag", "md", str) | ||||
| ) | ||||
| def info(model=None, markdown=False): | ||||
|     """Print info about spaCy installation. If a model shortcut link is | ||||
|     specified as an argument, print model information. Flag --markdown | ||||
|     prints details in Markdown for easy copy-pasting to GitHub issues. | ||||
|     """ | ||||
|     if model: | ||||
|         model_path = util.resolve_model_path(model) | ||||
|         meta = util.parse_package_meta(model_path) | ||||
|  |  | |||
|  | @ -1,24 +1,35 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import plac | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ..compat import symlink_to, path2str | ||||
| from ..util import prints | ||||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     origin=("package name or local path to model", "positional", None, str), | ||||
|     link_name=("name of shortuct link to create", "positional", None, str), | ||||
|     force=("force overwriting of existing link", "flag", "f", bool) | ||||
| ) | ||||
| def link(origin, link_name, force=False): | ||||
|     """Create a symlink for models within the spacy/data directory. Accepts | ||||
|     either the name of a pip package, or the local path to the model data | ||||
|     directory. Linking models allows loading them via spacy.load(link_name). | ||||
|     """ | ||||
|     if util.is_package(origin): | ||||
|         model_path = util.get_model_package_path(origin) | ||||
|     else: | ||||
|         model_path = Path(origin) | ||||
|     if not model_path.exists(): | ||||
|         prints("The data should be located in %s" % path2str(model_path), | ||||
|                title="Can't locate model data", exits=True) | ||||
|                title="Can't locate model data", exits=1) | ||||
|     link_path = util.get_data_path() / link_name | ||||
|     if link_path.exists() and not force: | ||||
|         prints("To overwrite an existing link, use the --force flag.", | ||||
|                title="Link %s already exists" % link_name, exits=True) | ||||
|                title="Link %s already exists" % link_name, exits=1) | ||||
|     elif link_path.exists(): | ||||
|         link_path.unlink() | ||||
|     try: | ||||
|  |  | |||
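link refuses to overwrite an existing shortcut unless --force is passed, removes the old link when forced, and then creates the symlink. A rough standalone version of that logic using pathlib (the real command goes through spacy.compat's symlink_to wrapper; paths here are illustrative):

    from pathlib import Path


    def make_link(model_path, link_path, force=False):
        model_path, link_path = Path(model_path), Path(link_path)
        if not model_path.exists():
            raise IOError("Can't locate model data at %s" % model_path)
        if link_path.exists() and not force:
            raise IOError("Link %s already exists; pass force=True to overwrite" % link_path)
        elif link_path.exists():
            link_path.unlink()           # remove the stale link before re-creating it
        link_path.symlink_to(model_path)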
|  | @ -1,122 +0,0 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import gzip | ||||
| import math | ||||
| from ast import literal_eval | ||||
| from preshed.counter import PreshCounter | ||||
| 
 | ||||
| from ..vocab import write_binary_vectors | ||||
| from ..compat import fix_text, path2str | ||||
| from ..util import prints | ||||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
| def model(lang, model_dir, freqs_data, clusters_data, vectors_data): | ||||
|     model_path = util.ensure_path(model_dir) | ||||
|     freqs_path = util.ensure_path(freqs_data) | ||||
|     clusters_path = util.ensure_path(clusters_data) | ||||
|     vectors_path = util.ensure_path(vectors_data) | ||||
|     if not freqs_path.is_file(): | ||||
|         prints(freqs_path, title="No frequencies file found", exits=True) | ||||
|     if clusters_path and not clusters_path.is_file(): | ||||
|         prints(clusters_path, title="No Brown clusters file found", exits=True) | ||||
|     if vectors_path and not vectors_path.is_file(): | ||||
|         prints(vectors_path, title="No word vectors file found", exits=True) | ||||
|     vocab = util.get_lang_class(lang).Defaults.create_vocab() | ||||
|     probs, oov_prob = read_probs(freqs_path) | ||||
|     clusters = read_clusters(clusters_path) if clusters_path else {} | ||||
|     populate_vocab(vocab, clusters, probs, oov_prob) | ||||
|     create_model(model_path, vectors_path, vocab, oov_prob) | ||||
| 
 | ||||
| 
 | ||||
| def create_model(model_path, vectors_path, vocab, oov_prob): | ||||
|     vocab_path = model_path / 'vocab' | ||||
|     lexemes_path = vocab_path / 'lexemes.bin' | ||||
|     strings_path = vocab_path / 'strings.json' | ||||
|     oov_path = vocab_path / 'oov_prob' | ||||
| 
 | ||||
|     if not model_path.exists(): | ||||
|         model_path.mkdir() | ||||
|     if not vocab_path.exists(): | ||||
|         vocab_path.mkdir() | ||||
|     vocab.dump(path2str(lexemes_path)) | ||||
|     with strings_path.open('w') as f: | ||||
|         vocab.strings.dump(f) | ||||
|     with oov_path.open('w') as f: | ||||
|         f.write('%f' % oov_prob) | ||||
|     if vectors_path: | ||||
|         vectors_dest = vocab_path / 'vec.bin' | ||||
|         write_binary_vectors(path2str(vectors_path), path2str(vectors_dest)) | ||||
| 
 | ||||
| 
 | ||||
| def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200): | ||||
|     counts = PreshCounter() | ||||
|     total = 0 | ||||
|     freqs_file = check_unzip(freqs_path) | ||||
|     for i, line in enumerate(freqs_file): | ||||
|         freq, doc_freq, key = line.rstrip().split('\t', 2) | ||||
|         freq = int(freq) | ||||
|         counts.inc(i+1, freq) | ||||
|         total += freq | ||||
|     counts.smooth() | ||||
|     log_total = math.log(total) | ||||
|     freqs_file = check_unzip(freqs_path) | ||||
|     probs = {} | ||||
|     for line in freqs_file: | ||||
|         freq, doc_freq, key = line.rstrip().split('\t', 2) | ||||
|         doc_freq = int(doc_freq) | ||||
|         freq = int(freq) | ||||
|         if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: | ||||
|             word = literal_eval(key) | ||||
|             smooth_count = counts.smoother(int(freq)) | ||||
|             probs[word] = math.log(smooth_count) - log_total | ||||
|     oov_prob = math.log(counts.smoother(0)) - log_total | ||||
|     return probs, oov_prob | ||||
| 
 | ||||
| 
 | ||||
| def read_clusters(clusters_path): | ||||
|     clusters = {} | ||||
|     with clusters_path.open() as f: | ||||
|         for line in f: | ||||
|             try: | ||||
|                 cluster, word, freq = line.split() | ||||
|                 word = fix_text(word) | ||||
|             except ValueError: | ||||
|                 continue | ||||
|             # If the clusterer has only seen the word a few times, its | ||||
|             # cluster is unreliable. | ||||
|             if int(freq) >= 3: | ||||
|                 clusters[word] = cluster | ||||
|             else: | ||||
|                 clusters[word] = '0' | ||||
|     # Expand clusters with re-casing | ||||
|     for word, cluster in list(clusters.items()): | ||||
|         if word.lower() not in clusters: | ||||
|             clusters[word.lower()] = cluster | ||||
|         if word.title() not in clusters: | ||||
|             clusters[word.title()] = cluster | ||||
|         if word.upper() not in clusters: | ||||
|             clusters[word.upper()] = cluster | ||||
|     return clusters | ||||
| 
 | ||||
| 
 | ||||
| def populate_vocab(vocab, clusters, probs, oov_prob): | ||||
|     for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): | ||||
|         lexeme = vocab[word] | ||||
|         lexeme.prob = prob | ||||
|         lexeme.is_oov = False | ||||
|         # Decode as a little-endian string, so that we can do & 15 to get | ||||
|         # the first 4 bits. See _parse_features.pyx | ||||
|         if word in clusters: | ||||
|             lexeme.cluster = int(clusters[word][::-1], 2) | ||||
|         else: | ||||
|             lexeme.cluster = 0 | ||||
| 
 | ||||
| 
 | ||||
| def check_unzip(file_path): | ||||
|     file_path_str = path2str(file_path) | ||||
|     if file_path_str.endswith('gz'): | ||||
|         return gzip.open(file_path_str) | ||||
|     else: | ||||
|         return file_path.open() | ||||
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import plac | ||||
| import shutil | ||||
| import requests | ||||
| from pathlib import Path | ||||
|  | @ -11,16 +12,26 @@ from .. import util | |||
| from .. import about | ||||
| 
 | ||||
| 
 | ||||
| def package(input_dir, output_dir, meta_path, force): | ||||
| @plac.annotations( | ||||
|     input_dir=("directory with model data", "positional", None, str), | ||||
|     output_dir=("output parent directory", "positional", None, str), | ||||
|     meta=("path to meta.json", "option", "m", str), | ||||
|     force=("force overwriting of existing folder in output directory", "flag", "f", bool) | ||||
| ) | ||||
| def package(input_dir, output_dir, meta, force): | ||||
|     """Generate Python package for model data, including meta and required | ||||
|     installation files. A new directory will be created in the specified | ||||
|     output directory, and model data will be copied over. | ||||
|     """ | ||||
|     input_path = util.ensure_path(input_dir) | ||||
|     output_path = util.ensure_path(output_dir) | ||||
|     meta_path = util.ensure_path(meta_path) | ||||
|     meta_path = util.ensure_path(meta) | ||||
|     if not input_path or not input_path.exists(): | ||||
|         prints(input_path, title="Model directory not found", exits=True) | ||||
|         prints(input_path, title="Model directory not found", exits=1) | ||||
|     if not output_path or not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=True) | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|     if meta_path and not meta_path.exists(): | ||||
|         prints(meta_path, title="meta.json not found", exits=True) | ||||
|         prints(meta_path, title="meta.json not found", exits=1) | ||||
| 
 | ||||
|     template_setup = get_template('setup.py') | ||||
|     template_manifest = get_template('MANIFEST.in') | ||||
|  | @ -55,7 +66,7 @@ def create_dirs(package_path, force): | |||
|         else: | ||||
|             prints(package_path, "Please delete the directory and try again, or " | ||||
|                    "use the --force flag to overwrite existing directories.", | ||||
|                    title="Package directory already exists", exits=True) | ||||
|                    title="Package directory already exists", exits=1) | ||||
|     Path.mkdir(package_path, parents=True) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -87,12 +98,12 @@ def validate_meta(meta, keys): | |||
|     for key in keys: | ||||
|         if key not in meta or meta[key] == '': | ||||
|             prints("This setting is required to build your package.", | ||||
|                    title='No "%s" setting found in meta.json' % key, exits=True) | ||||
|                    title='No "%s" setting found in meta.json' % key, exits=1) | ||||
| 
 | ||||
| 
 | ||||
| def get_template(filepath): | ||||
|     r = requests.get(about.__model_files__ + filepath) | ||||
|     if r.status_code != 200: | ||||
|         prints("Couldn't fetch template files from GitHub.", | ||||
|                title="Server error (%d)" % r.status_code, exits=True) | ||||
|                title="Server error (%d)" % r.status_code, exits=1) | ||||
|     return r.text | ||||
|  |  | |||
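package validates the model's meta.json by requiring a set of keys to be present and non-empty before any files are written. A minimal sketch of that check; the exact required keys are not shown in this hunk, so the list below is illustrative:

    def validate_meta(meta, keys=('lang', 'name', 'version')):
        # Illustrative key list; the real one is defined in spacy/cli/package.py.
        for key in keys:
            if key not in meta or meta[key] == '':
                raise ValueError('No "%s" setting found in meta.json' % key)
        return meta


    validate_meta({'lang': 'en', 'name': 'core_web_sm', 'version': '2.0.0'})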
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals, division, print_function | ||||
| 
 | ||||
| import plac | ||||
| import json | ||||
| from collections import defaultdict | ||||
| import cytoolz | ||||
|  | @ -18,19 +19,33 @@ from .. import util | |||
| from .. import displacy | ||||
| 
 | ||||
| 
 | ||||
| def train(lang_id, output_dir, train_data, dev_data, n_iter, n_sents, | ||||
| @plac.annotations( | ||||
|     lang=("model language", "positional", None, str), | ||||
|     output_dir=("output directory to store model in", "positional", None, str), | ||||
|     train_data=("location of JSON-formatted training data", "positional", None, str), | ||||
|     dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), | ||||
|     n_iter=("number of iterations", "option", "n", int), | ||||
|     n_sents=("number of sentences", "option", "ns", int), | ||||
|     use_gpu=("Use GPU", "flag", "G", bool), | ||||
|     no_tagger=("Don't train tagger", "flag", "T", bool), | ||||
|     no_parser=("Don't train parser", "flag", "P", bool), | ||||
|     no_entities=("Don't train NER", "flag", "N", bool) | ||||
| ) | ||||
| def train(lang, output_dir, train_data, dev_data, n_iter, n_sents, | ||||
|           use_gpu, no_tagger, no_parser, no_entities): | ||||
|     """Train a model. Expects data in spaCy's JSON format.""" | ||||
|     n_sents = n_sents or None | ||||
|     output_path = util.ensure_path(output_dir) | ||||
|     train_path = util.ensure_path(train_data) | ||||
|     dev_path = util.ensure_path(dev_data) | ||||
|     if not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=True) | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|     if not train_path.exists(): | ||||
|         prints(train_path, title="Training data not found", exits=True) | ||||
|         prints(train_path, title="Training data not found", exits=1) | ||||
|     if dev_path and not dev_path.exists(): | ||||
|         prints(dev_path, title="Development data not found", exits=True) | ||||
|         prints(dev_path, title="Development data not found", exits=1) | ||||
| 
 | ||||
|     lang_class = util.get_lang_class(lang_id) | ||||
|     lang_class = util.get_lang_class(lang) | ||||
| 
 | ||||
|     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] | ||||
|     if no_tagger and 'tags' in pipeline: pipeline.remove('tags') | ||||
|  |  | |||
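train starts from the full pipeline and strips components according to the --no-tagger/--no-parser/--no-entities flags, as the last hunk shows for tags. A small sketch of that flag-to-pipeline filtering (the parser and entities branches are assumed to follow the same shape as the tagger one):

    def build_pipeline(no_tagger=False, no_parser=False, no_entities=False):
        pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
        if no_tagger and 'tags' in pipeline:
            pipeline.remove('tags')
        if no_parser and 'dependencies' in pipeline:
            pipeline.remove('dependencies')
        if no_entities and 'entities' in pipeline:
            pipeline.remove('entities')
        return pipeline


    print(build_pipeline(no_parser=True))  # ['token_vectors', 'tags', 'entities']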
|  | @ -5,16 +5,23 @@ include ../../_includes/_mixins | |||
| p | ||||
|     |  As of v1.7.0, spaCy comes with new command line helpers to download and | ||||
|     |  link models and show useful debugging information. For a list of available | ||||
|     |  commands, type #[code python -m spacy --help]. | ||||
|     |  commands, type #[code python -m spacy]. To make the command even more | ||||
|     |  convenient, we recommend | ||||
|     |  #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias] | ||||
|     |  mapping #[code python -m spacy] to #[code spacy]. | ||||
| 
 | ||||
| +aside("Why python -m?") | ||||
|     |  The problem with a global entry point is that it's resolved by looking up | ||||
|     |  entries in your #[code PATH] environment variable. This can give you | ||||
|     |  unexpected results, like executing the wrong spaCy installation | ||||
|     |  (especially when using #[code virtualenv]). #[code python -m] prevents | ||||
|     |  fallbacks to system modules and makes sure the correct spaCy version is | ||||
|     |  used. If you hate typing it every time, we recommend creating an | ||||
|     |  #[code alias] instead. | ||||
|     |  unexpected results, like executing the wrong spaCy installation. | ||||
|     |  #[code python -m] prevents fallbacks to system modules. | ||||
| 
 | ||||
| +infobox("⚠️ Deprecation note") | ||||
|     |  As of spaCy 2.0, the #[code model] command to initialise a model data | ||||
|     |  directory is deprecated. The command was only necessary because previous | ||||
|     |  versions of spaCy expected a model directory to already be set up. This | ||||
|     |  has since been changed, so you can use the #[+api("cli#train") #[code train]] | ||||
|     |  command straight away. | ||||
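Since the model command is gone, no data directory has to be initialised first; train can be used straight away, including programmatically via the command function itself. As a sketch with hypothetical paths (training and development data must already be in spaCy's JSON format, e.g. produced by convert):

    from spacy.cli import train

    # Hypothetical paths, shown only to illustrate the call signature.
    train('en', '/tmp/model', 'train.json', 'dev.json',
          n_iter=15, n_sents=0, use_gpu=False,
          no_tagger=False, no_parser=False, no_entities=False)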
| 
 | ||||
| +h(2, "download") Download | ||||
| 
 | ||||
|  | @ -45,7 +52,7 @@ p | |||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +infobox("Important note") | ||||
| +aside("Downloading best practices") | ||||
|     |  The #[code download] command is mostly intended as a convenient, | ||||
|     |  interactive wrapper – it performs compatibility checks and prints | ||||
|     |  detailed messages in case things go wrong. It's #[strong not recommended] | ||||
|  | @ -116,7 +123,6 @@ p | |||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "convert") Convert | ||||
|     +tag experimental | ||||
| 
 | ||||
| p | ||||
|     |  Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format] | ||||
|  | @ -153,49 +159,7 @@ p | |||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "model") Model | ||||
|     +tag experimental | ||||
| 
 | ||||
| p | ||||
|     |  Initialise a new model and its data directory. For more info on this, see | ||||
|     |  the documentation on #[+a("/docs/usage/adding-languages") adding languages]. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code lang] | ||||
|         +cell positional | ||||
|         +cell Model language. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code model_dir] | ||||
|         +cell positional | ||||
|         +cell Output directory to store the model in. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code freqs_data] | ||||
|         +cell positional | ||||
|         +cell Tab-separated frequencies file. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code clusters_data] | ||||
|         +cell positional | ||||
|         +cell Brown clusters file (optional). | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code vectors_data] | ||||
|         +cell positional | ||||
|         +cell Word vectors file (optional). | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --help], #[code -h] | ||||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "train") Train | ||||
|     +tag experimental | ||||
| 
 | ||||
| p | ||||
|     |  Train a model. Expects data in spaCy's | ||||
|  | @ -231,7 +195,7 @@ p | |||
|         +cell Number of iterations (default: #[code 15]). | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --nsents] | ||||
|         +cell #[code --n_sents], #[code -ns] | ||||
|         +cell option | ||||
|         +cell Number of sentences (default: #[code 0]). | ||||
| 
 | ||||
|  | @ -241,7 +205,7 @@ p | |||
|         +cell L1 regularization penalty for parser (default: #[code 0.0]). | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --use-gpu], #[code -g] | ||||
|         +cell #[code --use-gpu], #[code -G] | ||||
|         +cell flag | ||||
|         +cell Use GPU. | ||||
| 
 | ||||
|  | @ -266,17 +230,16 @@ p | |||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "package") Package | ||||
|     +tag experimental | ||||
| 
 | ||||
| p | ||||
|     |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package] | ||||
|     |  from an existing model data directory. All data files are copied over. | ||||
|     |  If the path to a meta.json is supplied, or a meta.json is found in the | ||||
|     |  input directory, this file is used. Otherwise, the data can be entered | ||||
|     |  directly from the command line. While this feature is still experimental, | ||||
|     |  the required file templates are downloaded from | ||||
|     |  #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means | ||||
|     |  you need to be connected to the internet to use this command. | ||||
|     |  directly from the command line. The required file templates are downloaded | ||||
|     |  from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make | ||||
|     |  sure you're always using the latest versions. This means you need to be | ||||
|     |  connected to the internet to use this command. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy package [input_dir] [output_dir] [--meta] [--force] | ||||
|  |  | |||