From d941fc36672bb08cfaf59c2301b98f27ff846667 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 27 Oct 2017 14:38:39 +0200
Subject: [PATCH] Tidy up CLI

---
 spacy/cli/convert.py                   | 12 +++----
 spacy/cli/converters/conll_ner2json.py |  3 +-
 spacy/cli/download.py                  | 46 ++++++++++++++------------
 spacy/cli/evaluate.py                  | 45 +++++++++++--------------
 spacy/cli/info.py                      |  3 +-
 spacy/cli/link.py                      |  8 ++---
 spacy/cli/package.py                   | 28 ++++++++++------
 spacy/cli/profile.py                   |  8 ++---
 spacy/cli/train.py                     | 36 ++++++++++----------
 spacy/cli/validate.py                  | 13 +++++---
 10 files changed, 103 insertions(+), 99 deletions(-)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index d9a812a15..ad17844a1 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -7,10 +7,9 @@ from pathlib import Path
 from .converters import conllu2json, iob2json, conll_ner2json
 from ..util import prints

-# Converters are matched by file extension. To add a converter, add a new entry
-# to this dict with the file extension mapped to the converter function imported
-# from /converters.
-
+# Converters are matched by file extension. To add a converter, add a new
+# entry to this dict with the file extension mapped to the converter function
+# imported from /converters.
 CONVERTERS = {
     'conllu': conllu2json,
     'conll': conllu2json,
@@ -24,8 +23,7 @@ CONVERTERS = {
     output_dir=("output directory for converted file", "positional", None, str),
     n_sents=("Number of sentences per doc", "option", "n", int),
     converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool)
-)
+    morphology=("Enable appending morphology to tags", "flag", "m", bool))
 def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
             converter='auto'):
     """
@@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
         prints(output_path, title="Output directory not found", exits=1)
     if converter == 'auto':
         converter = input_path.suffix[1:]
-    if not converter in CONVERTERS:
+    if converter not in CONVERTERS:
         prints("Can't find converter for %s" % converter,
                title="Unknown format", exits=1)
     func = CONVERTERS[converter]
diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py
index e3bd82e7e..fb2979652 100644
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@@ -8,7 +8,8 @@ from ...gold import iob_to_biluo

 def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
     """
-    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
+    Convert files in the CoNLL-2003 NER format into JSON format for use with
+    train cli.
     """
     docs = read_conll_ner(input_path)
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 28ae07865..0d3f11153 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -13,10 +13,9 @@ from .. import about


 @plac.annotations(
-    model=("model to download (shortcut or model name)", "positional", None, str),
+    model=("model to download, shortcut or name", "positional", None, str),
     direct=("force direct download. Needs model name with version and won't "
-            "perform compatibility check", "flag", "d", bool)
-)
+            "perform compatibility check", "flag", "d", bool))
 def download(cmd, model, direct=False):
     """
     Download compatible model from default download path using pip. Model
@@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
     model_name = shortcuts.get(model, model)
     compatibility = get_compatibility()
     version = get_version(model_name, compatibility)
-    dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+    dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
+                                                        v=version))
     if dl == 0:
         try:
             # Get package path here because link uses
-            # pip.get_installed_distributions() to check if model is a package,
-            # which fails if model was just installed via subprocess
+            # pip.get_installed_distributions() to check if model is a
+            # package, which fails if model was just installed via
+            # subprocess
             package_path = get_package_path(model_name)
-            link(None, model_name, model, force=True, model_path=package_path)
+            link(None, model_name, model, force=True,
+                 model_path=package_path)
         except:
-            # Dirty, but since spacy.download and the auto-linking is mostly
-            # a convenience wrapper, it's best to show a success message and
-            # loading instructions, even if linking fails.
-            prints("Creating a shortcut link for 'en' didn't work (maybe you "
-                   "don't have admin permissions?), but you can still load "
-                   "the model via its full package name:",
-                   "nlp = spacy.load('%s')" % model_name,
-                   title="Download successful")
+            # Dirty, but since spacy.download and the auto-linking is
+            # mostly a convenience wrapper, it's best to show a success
+            # message and loading instructions, even if linking fails.
+            prints(
+                "Creating a shortcut link for 'en' didn't work (maybe "
+                "you don't have admin permissions?), but you can still "
+                "load the model via its full package name:",
+                "nlp = spacy.load('%s')" % model_name,
+                title="Download successful")

@@ -52,9 +55,10 @@ def download(cmd, model, direct=False):

 def get_json(url, desc):
     r = requests.get(url)
     if r.status_code != 200:
-        prints("Couldn't fetch %s. Please find a model for your spaCy installation "
-               "(v%s), and download it manually." % (desc, about.__version__),
-               about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
+        msg = ("Couldn't fetch %s. Please find a model for your spaCy "
+               "installation (v%s), and download it manually.")
+        prints(msg % (desc, about.__version__), about.__docs_models__,
+               title="Server error (%d)" % r.status_code, exits=1)
     return r.json()

@@ -71,13 +75,13 @@ def get_compatibility():

 def get_version(model, comp):
     if model not in comp:
         version = about.__version__
-        prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
-               title="Compatibility error", exits=1)
+        msg = "No compatible model found for '%s' (spaCy v%s)."
+        prints(msg % (model, version), title="Compatibility error", exits=1)
     return comp[model][0]


 def download_model(filename):
     download_url = about.__download_url__ + '/' + filename
-    return subprocess.call([sys.executable, '-m',
-            'pip', 'install', '--no-cache-dir', download_url],
-            env=os.environ.copy())
+    return subprocess.call(
+        [sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
+         download_url], env=os.environ.copy())
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 29e30b7d2..d4d54d8aa 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -2,27 +2,15 @@ from __future__ import unicode_literals, division, print_function

 import plac
-import json
-from collections import defaultdict
-import cytoolz
-from pathlib import Path
-import dill
-import tqdm
-from thinc.neural._classes.model import Model
-from thinc.neural.optimizers import linear_decay
 from timeit import default_timer as timer
 import random
 import numpy.random

-from ..tokens.doc import Doc
-from ..scorer import Scorer
-from ..gold import GoldParse, merge_sents
-from ..gold import GoldCorpus, minibatch
+from ..gold import GoldCorpus
 from ..util import prints
 from .. import util
-from .. import about
 from .. import displacy
-from ..compat import json_dumps
+

 random.seed(0)
 numpy.random.seed(0)
@@ -30,17 +18,18 @@ numpy.random.seed(0)

 @plac.annotations(
     model=("Model name or path", "positional", None, str),
-    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
+    data_path=("Location of JSON-formatted evaluation data", "positional",
+               None, str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     gpu_id=("Use GPU", "option", "g", int),
-    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
-    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
-)
+    displacy_path=("Directory to output rendered parses as HTML", "option",
+                   "dp", str),
+    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
 def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
              displacy_path=None, displacy_limit=25):
     """
-    Evaluate a model. To render a sample of parses in a HTML file, set an output
-    directory as the displacy_path argument.
+    Evaluate a model. To render a sample of parses in a HTML file, set an
+    output directory as the displacy_path argument.
""" if gpu_id >= 0: util.use_gpu(gpu_id) @@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, if not data_path.exists(): prints(data_path, title="Evaluation data not found", exits=1) if displacy_path and not displacy_path.exists(): - prints(displacy_path, title="Visualization output directory not found", exits=1) + prints(displacy_path, title="Visualization output directory not found", + exits=1) corpus = GoldCorpus(data_path, data_path) nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) @@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, docs, golds = zip(*dev_docs) render_deps = 'parser' in nlp.meta.get('pipeline', []) render_ents = 'ner' in nlp.meta.get('pipeline', []) - render_parses(docs, displacy_path, model_name=model, limit=displacy_limit, - deps=render_deps, ents=render_ents) - prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit) + render_parses(docs, displacy_path, model_name=model, + limit=displacy_limit, deps=render_deps, ents=render_ents) + msg = "Generated %s parses as HTML" % displacy_limit + prints(displacy_path, title=msg) -def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True): +def render_parses(docs, output_path, model_name='', limit=250, deps=True, + ents=True): docs[0].user_data['title'] = model_name if ents: with (output_path / 'entities.html').open('w') as file_: @@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T file_.write(html) if deps: with (output_path / 'parses.html').open('w') as file_: - html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True}) + html = displacy.render(docs[:limit], style='dep', page=True, + options={'compact': True}) file_.write(html) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 5d45b271c..3636494fb 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -12,8 +12,7 @@ from .. import util @plac.annotations( model=("optional: shortcut link of model", "positional", None, str), - markdown=("generate Markdown for GitHub issues", "flag", "md", str) -) + markdown=("generate Markdown for GitHub issues", "flag", "md", str)) def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 5b333dae5..cfbc97e3e 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -12,8 +12,7 @@ from .. import util @plac.annotations( origin=("package name or local path to model", "positional", None, str), link_name=("name of shortuct link to create", "positional", None, str), - force=("force overwriting of existing link", "flag", "f", bool) -) + force=("force overwriting of existing link", "flag", "f", bool)) def link(cmd, origin, link_name, force=False, model_path=None): """ Create a symlink for models within the spacy/data directory. Accepts @@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None): # This is quite dirty, but just making sure other errors are caught. prints("Creating a symlink in spacy/data failed. Make sure you have " "the required permissions and try re-running the command as " - "admin, or use a virtualenv. You can still import the model as a " - "module and call its load() method, or create the symlink manually.", + "admin, or use a virtualenv. 
+               "admin, or use a virtualenv. You can still import the model as "
+               "a module and call its load() method, or create the symlink "
+               "manually.",
                "%s --> %s" % (path2str(model_path), path2str(link_path)),
                title="Error: Couldn't link model to '%s'" % link_name)
         raise
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 6b0811459..d1984fe65 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -16,10 +16,12 @@ from .. import about
     input_dir=("directory with model data", "positional", None, str),
     output_dir=("output parent directory", "positional", None, str),
     meta_path=("path to meta.json", "option", "m", str),
-    create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
-    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
-)
-def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
+    create_meta=("create meta.json, even if one exists in directory", "flag",
+                 "c", bool),
+    force=("force overwriting of existing folder in output directory", "flag",
+           "f", bool))
+def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
+            force=False):
     """
     Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified
@@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
     package_path = main_path / model_name
     create_dirs(package_path, force)
-    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
+    shutil.copytree(path2str(input_path),
+                    path2str(package_path / model_name_v))
     create_file(main_path / 'meta.json', json_dumps(meta))
     create_file(main_path / 'setup.py', template_setup)
     create_file(main_path / 'MANIFEST.in', template_manifest)
     create_file(package_path / '__init__.py', template_init)
-    prints(main_path, "To build the package, run `python setup.py sdist` in this "
-           "directory.", title="Successfully created package '%s'" % model_name_v)
+    prints(main_path, "To build the package, run `python setup.py sdist` in "
+           "this directory.",
+           title="Successfully created package '%s'" % model_name_v)
@@ -66,9 +70,10 @@ def create_dirs(package_path, force):
     if package_path.exists():
         if force:
             shutil.rmtree(path2str(package_path))
         else:
-            prints(package_path, "Please delete the directory and try again, or "
-                   "use the --force flag to overwrite existing directories.",
-                   title="Package directory already exists", exits=1)
+            prints(package_path, "Please delete the directory and try again, "
+                   "or use the --force flag to overwrite existing "
+                   "directories.", title="Package directory already exists",
+                   exits=1)
     Path.mkdir(package_path, parents=True)
@@ -82,7 +87,8 @@ def generate_meta(model_path):
     settings = [('lang', 'Model language', 'en'),
                 ('name', 'Model name', 'model'),
                 ('version', 'Model version', '0.0.0'),
-                ('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
+                ('spacy_version', 'Required spaCy version',
+                 '>=%s,<3.0.0' % about.__version__),
                 ('description', 'Model description', False),
                 ('author', 'Author', False),
                 ('email', 'Author email', False),
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index db6fc5b41..a394989d0 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -27,15 +27,15 @@ def read_inputs(loc):

 @plac.annotations(
     lang=("model/language", "positional", None, str),
-    inputs=("Location of input file", "positional", None, read_inputs)
-)
+    inputs=("Location of input file", "positional", None, read_inputs))
 def profile(cmd, lang, inputs=None):
     """
     Profile a spaCy pipeline, to find out which functions take the most time.
     """
-    nlp = spacy.load(lang)
+    nlp = spacy.load(lang)
     texts = list(cytoolz.take(10000, inputs))
-    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
+    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
+                    "Profile.prof")
     s = pstats.Stats("Profile.prof")
     s.strip_dirs().sort_stats("time").print_stats()
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index da398751c..fb96e6c05 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -2,21 +2,14 @@ from __future__ import unicode_literals, division, print_function

 import plac
-import json
-from collections import defaultdict
-import cytoolz
 from pathlib import Path
 import dill
 import tqdm
 from thinc.neural._classes.model import Model
-from thinc.neural.optimizers import linear_decay
 from timeit import default_timer as timer
 import random
 import numpy.random

-from ..tokens.doc import Doc
-from ..scorer import Scorer
-from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
@@ -31,8 +24,10 @@ numpy.random.seed(0)
 @plac.annotations(
     lang=("model language", "positional", None, str),
     output_dir=("output directory to store model in", "positional", None, str),
-    train_data=("location of JSON-formatted training data", "positional", None, str),
-    dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
+    train_data=("location of JSON-formatted training data", "positional",
+                None, str),
+    dev_data=("location of JSON-formatted development data (optional)",
+              "positional", None, str),
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
@@ -42,11 +37,12 @@ numpy.random.seed(0)
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     version=("Model version", "option", "V", str),
-    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
-)
+    meta_path=("Optional path to meta.json. All relevant properties will be "
+               "overwritten.", "option", "m", Path))
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
-          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
-          gold_preproc=False, version="0.0.0", meta_path=None):
+          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
+          no_entities=False, gold_preproc=False, version="0.0.0",
+          meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     meta.setdefault('name', 'unnamed')

     pipeline = ['tagger', 'parser', 'ner']
-    if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
-    if no_parser and 'parser' in pipeline: pipeline.remove('parser')
-    if no_entities and 'ner' in pipeline: pipeline.remove('ner')
+    if no_tagger and 'tagger' in pipeline:
+        pipeline.remove('tagger')
+    if no_parser and 'parser' in pipeline:
+        pipeline.remove('parser')
+    if no_entities and 'ner' in pipeline:
+        pipeline.remove('ner')

     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.
@@ -139,7 +138,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
                 scorer = nlp_loaded.evaluate(dev_docs)
                 end_time = timer()
                 cpu_wps = nwords/(end_time-start_time)
-                acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
+                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                 with acc_loc.open('w') as file_:
                     file_.write(json_dumps(scorer.scores))
                 meta_loc = output_path / ('model%d' % i) / 'meta.json'
@@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
                 with meta_loc.open('w') as file_:
                     file_.write(json_dumps(meta))
                 util.set_env_log(True)
-            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
+            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
+                           gpu_wps=gpu_wps)
     finally:
         print("Saving model...")
         try:
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index c1f992ed6..1c645a554 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,5 +1,5 @@
 # coding: utf8
-from __future__ import unicode_literals
+from __future__ import unicode_literals, print_function

 import requests
 import pkg_resources
@@ -29,8 +29,10 @@ def validate(cmd):
     model_links = get_model_links(current_compat)
     model_pkgs = get_model_pkgs(current_compat, all_models)
     incompat_links = {l for l, d in model_links.items() if not d['compat']}
-    incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
-    incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
+    incompat_models = {d['name'] for _, d in model_pkgs.items()
+                       if not d['compat']}
+    incompat_models.update([d['name'] for _, d in model_links.items()
+                            if not d['compat']])
     na_models = [m for m in incompat_models if m not in current_compat]
     update_models = [m for m in incompat_models if m in current_compat]
@@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):


 def get_model_row(compat, name, data, type='package'):
-    tpl_row = ' {:<10}' + (' {:<20}' * 4)
     tpl_red = '\x1b[38;5;1m{}\x1b[0m'
     tpl_green = '\x1b[38;5;2m{}\x1b[0m'
     if data['compat']:
@@ -110,7 +111,8 @@ def get_row(*args):
 def is_model_path(model_path):
     exclude = ['cache', 'pycache', '__pycache__']
     name = model_path.parts[-1]
-    return model_path.is_dir() and name not in exclude and not name.startswith('.')
+    return (model_path.is_dir() and name not in exclude
+            and not name.startswith('.'))


 def is_compat(compat, name, version):
@@ -118,6 +120,7 @@ def is_compat(compat, name, version):


 def reformat_version(version):
+    """Hack to reformat old versions ending on '-alpha' to match pip format."""
     if version.endswith('-alpha'):
         return version.replace('-alpha', 'a0')
     return version.replace('-alpha', 'a')
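For readers unfamiliar with the conventions this patch normalizes, the two recurring patterns are plac annotation tuples of (help, kind, abbreviation, type) and the extension-keyed CONVERTERS dispatch dict from convert.py. The following minimal sketch shows them in isolation; the demo_convert command and its converter stub are hypothetical stand-ins, not spaCy's actual implementations.

# Minimal standalone sketch of the CLI pattern used in the patch above.
# The command and converter bodies here are hypothetical; only the
# conventions (annotation tuples, dispatch dict) mirror the real code.
from __future__ import unicode_literals, print_function

import plac


def conllu2json(input_path, output_path, n_sents=1, use_morphology=False):
    # Stub converter: a real one would read input_path and write JSON
    # suitable for the train CLI into output_path.
    print("converting %s -> %s (n_sents=%d, morphology=%s)"
          % (input_path, output_path, n_sents, use_morphology))


# Converters are matched by file extension: to add one, map the extension
# to a converter function, as in spacy/cli/convert.py.
CONVERTERS = {
    'conllu': conllu2json,
    'conll': conllu2json,
}


@plac.annotations(
    # Each plac annotation is a (help, kind, abbreviation, type) tuple;
    # positional arguments take no abbreviation, hence None.
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    n_sents=("Number of sentences per doc", "option", "n", int),
    morphology=("Enable appending morphology to tags", "flag", "m", bool))
def demo_convert(input_file, output_dir, n_sents=1, morphology=False):
    """Convert a file, picking the converter by its file extension."""
    ext = input_file.rsplit('.', 1)[-1]
    if ext not in CONVERTERS:  # the membership test the patch rewrites
        raise SystemExit("Can't find converter for %s" % ext)
    func = CONVERTERS[ext]
    func(input_file, output_dir, n_sents=n_sents, use_morphology=morphology)


if __name__ == '__main__':
    # e.g. python demo_convert.py data.conllu /tmp -n 10 -m
    plac.call(demo_convert)

spaCy's own commands additionally take a leading cmd argument, which the spacy entry point supplies when dispatching subcommands; the sketch drops it so it stays runnable on its own.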