Tidy up CLI

ines 2017-10-27 14:38:39 +02:00
parent 298c3d973c
commit d941fc3667
10 changed files with 103 additions and 99 deletions

spacy/cli/convert.py

@@ -7,10 +7,9 @@ from pathlib import Path
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.
# Converters are matched by file extension. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function
# imported from /converters.
CONVERTERS = {
'conllu': conllu2json,
'conll': conllu2json,
@@ -24,8 +23,7 @@ CONVERTERS = {
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
morphology=("Enable appending morphology to tags", "flag", "m", bool))
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
converter='auto'):
"""
@@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
prints(output_path, title="Output directory not found", exits=1)
if converter == 'auto':
converter = input_path.suffix[1:]
if not converter in CONVERTERS:
if converter not in CONVERTERS:
prints("Can't find converter for %s" % converter,
title="Unknown format", exits=1)
func = CONVERTERS[converter]
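
The suffix-based dispatch above is easy to reuse outside the CLI. A minimal sketch of the same lookup, with an illustrative converter table and a hypothetical helper name (pick_converter is not part of spaCy):

    from pathlib import Path

    def pick_converter(file_path, converters, name='auto'):
        # 'auto' falls back to the file extension: 'data.conllu' -> 'conllu'
        if name == 'auto':
            name = Path(file_path).suffix[1:]
        if name not in converters:
            raise ValueError("Can't find converter for %s" % name)
        return converters[name]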

spacy/cli/converters/conll_ner2json.py

@@ -8,7 +8,8 @@ from ...gold import iob_to_biluo
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
"""
Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
Convert files in the CoNLL-2003 NER format into JSON format for use with
train cli.
"""
docs = read_conll_ner(input_path)
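
For context, iob_to_biluo (imported above from spacy.gold) maps IOB tags onto the BILUO scheme, which adds explicit Last/Unit tags so entity spans are unambiguous. A small hedged example, assuming a v2-era spaCy install:

    from spacy.gold import iob_to_biluo

    tags = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC']
    print(iob_to_biluo(tags))  # expected: ['O', 'B-PER', 'L-PER', 'O', 'U-LOC']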

spacy/cli/download.py

@@ -13,10 +13,9 @@ from .. import about
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
model=("model to download, shortcut or name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
"perform compatibility check", "flag", "d", bool))
def download(cmd, model, direct=False):
"""
Download compatible model from default download path using pip. Model
@@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
v=version))
if dl == 0:
try:
# Get package path here because link uses
# pip.get_installed_distributions() to check if model is a package,
# which fails if model was just installed via subprocess
# pip.get_installed_distributions() to check if model is a
# package, which fails if model was just installed via
# subprocess
package_path = get_package_path(model_name)
link(None, model_name, model, force=True, model_path=package_path)
link(None, model_name, model, force=True,
model_path=package_path)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
# Dirty, but since spacy.download and the auto-linking is
# mostly a convenience wrapper, it's best to show a success
# message and loading instructions, even if linking fails.
prints(
"Creating a shortcut link for 'en' didn't work (maybe "
"you don't have admin permissions?), but you can still "
"load the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
@@ -52,9 +55,10 @@ def download(cmd, model, direct=False):
def get_json(url, desc):
r = requests.get(url)
if r.status_code != 200:
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
"(v%s), and download it manually." % (desc, about.__version__),
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
msg = ("Couldn't fetch %s. Please find a model for your spaCy "
"installation (v%s), and download it manually.")
prints(msg % (desc, about.__version__), about.__docs_models__,
title="Server error (%d)" % r.status_code, exits=1)
return r.json()
@@ -71,13 +75,13 @@ def get_compatibility():
def get_version(model, comp):
if model not in comp:
version = about.__version__
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
title="Compatibility error", exits=1)
msg = "No compatible model found for '%s' (spaCy v%s)."
prints(msg % (model, version), title="Compatibility error", exits=1)
return comp[model][0]
def download_model(filename):
download_url = about.__download_url__ + '/' + filename
return subprocess.call([sys.executable, '-m',
'pip', 'install', '--no-cache-dir', download_url],
env=os.environ.copy())
return subprocess.call(
[sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
download_url], env=os.environ.copy())
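
The download_model helper uses a pattern worth noting on its own: invoking pip through sys.executable guarantees the package is installed into the interpreter that is actually running spaCy. A standalone sketch (pip_install and the URL are illustrative, not spaCy API):

    import os
    import subprocess
    import sys

    def pip_install(url):
        # install into the current interpreter's environment
        return subprocess.call(
            [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', url],
            env=os.environ.copy())

    # pip_install('https://example.com/some_model-1.0.0.tar.gz')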

spacy/cli/evaluate.py

@@ -2,27 +2,15 @@
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..gold import GoldCorpus
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps
random.seed(0)
numpy.random.seed(0)
@@ -30,17 +18,18 @@ numpy.random.seed(0)
@plac.annotations(
model=("Model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional",
None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
)
displacy_path=("Directory to output rendered parses as HTML", "option",
"dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
displacy_path=None, displacy_limit=25):
"""
Evaluate a model. To render a sample of parses in an HTML file, set an output
directory as the displacy_path argument.
Evaluate a model. To render a sample of parses in an HTML file, set an
output directory as the displacy_path argument.
"""
if gpu_id >= 0:
util.use_gpu(gpu_id)
@@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
if not data_path.exists():
prints(data_path, title="Evaluation data not found", exits=1)
if displacy_path and not displacy_path.exists():
prints(displacy_path, title="Visualization output directory not found", exits=1)
prints(displacy_path, title="Visualization output directory not found",
exits=1)
corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
docs, golds = zip(*dev_docs)
render_deps = 'parser' in nlp.meta.get('pipeline', [])
render_ents = 'ner' in nlp.meta.get('pipeline', [])
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
deps=render_deps, ents=render_ents)
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
render_parses(docs, displacy_path, model_name=model,
limit=displacy_limit, deps=render_deps, ents=render_ents)
msg = "Generated %s parses as HTML" % displacy_limit
prints(displacy_path, title=msg)
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
ents=True):
docs[0].user_data['title'] = model_name
if ents:
with (output_path / 'entities.html').open('w') as file_:
@@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
file_.write(html)
if deps:
with (output_path / 'parses.html').open('w') as file_:
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
html = displacy.render(docs[:limit], style='dep', page=True,
options={'compact': True})
file_.write(html)
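
The same evaluation can be driven from Python. A hedged sketch, assuming a v2-era spaCy install, an installed model and a dev set in spaCy's JSON format (the paths and model name are placeholders):

    import spacy
    from spacy.gold import GoldCorpus

    dev_path = 'dev.json'                    # placeholder path
    nlp = spacy.load('en_core_web_sm')       # any installed model
    corpus = GoldCorpus(dev_path, dev_path)  # (train, dev); dev path reused here
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=False))
    scorer = nlp.evaluate(dev_docs)
    print(scorer.scores)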

spacy/cli/info.py

@@ -12,8 +12,7 @@ from .. import util
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
markdown=("generate Markdown for GitHub issues", "flag", "md", str))
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown

spacy/cli/link.py

@@ -12,8 +12,7 @@ from .. import util
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortcut link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
force=("force overwriting of existing link", "flag", "f", bool))
def link(cmd, origin, link_name, force=False, model_path=None):
"""
Create a symlink for models within the spacy/data directory. Accepts
@@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None):
# This is quite dirty, but just making sure other errors are caught.
prints("Creating a symlink in spacy/data failed. Make sure you have "
"the required permissions and try re-running the command as "
"admin, or use a virtualenv. You can still import the model as a "
"module and call its load() method, or create the symlink manually.",
"admin, or use a virtualenv. You can still import the model as "
"a module and call its load() method, or create the symlink "
"manually.",
"%s --> %s" % (path2str(model_path), path2str(link_path)),
title="Error: Couldn't link model to '%s'" % link_name)
raise
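
Stripped of spaCy's helpers, the linking step is a single pathlib call, and the failure mode described in the message above is an OSError raised when the process lacks permissions. A minimal sketch with hypothetical paths (make_link is not spaCy API):

    from pathlib import Path

    def make_link(model_path, link_path, force=False):
        # replace an existing link when force=True, mirroring the --force flag
        if (link_path.exists() or link_path.is_symlink()) and force:
            link_path.unlink()
        link_path.symlink_to(model_path)  # OSError without admin rights on Windows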

spacy/cli/package.py

@@ -16,10 +16,12 @@ from .. import about
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
create_meta=("create meta.json, even if one exists in directory", "flag",
"c", bool),
force=("force overwriting of existing folder in output directory", "flag",
"f", bool))
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
@@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
package_path = main_path / model_name
create_dirs(package_path, force)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
shutil.copytree(path2str(input_path),
path2str(package_path / model_name_v))
create_file(main_path / 'meta.json', json_dumps(meta))
create_file(main_path / 'setup.py', template_setup)
create_file(main_path / 'MANIFEST.in', template_manifest)
create_file(package_path / '__init__.py', template_init)
prints(main_path, "To build the package, run `python setup.py sdist` in this "
"directory.", title="Successfully created package '%s'" % model_name_v)
prints(main_path, "To build the package, run `python setup.py sdist` in "
"this directory.",
title="Successfully created package '%s'" % model_name_v)
def create_dirs(package_path, force):
@@ -66,9 +70,10 @@ def create_dirs(package_path, force):
if force:
shutil.rmtree(path2str(package_path))
else:
prints(package_path, "Please delete the directory and try again, or "
"use the --force flag to overwrite existing directories.",
title="Package directory already exists", exits=1)
prints(package_path, "Please delete the directory and try again, "
"or use the --force flag to overwrite existing "
"directories.", title="Package directory already exists",
exits=1)
Path.mkdir(package_path, parents=True)
@@ -82,7 +87,8 @@ def generate_meta(model_path):
settings = [('lang', 'Model language', 'en'),
('name', 'Model name', 'model'),
('version', 'Model version', '0.0.0'),
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
('spacy_version', 'Required spaCy version',
'>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False),
('author', 'Author', False),
('email', 'Author email', False),
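
Each settings entry is a (key, description, default) triple that generate_meta walks to build the meta dict, prompting for each field and falling back to the default. A hedged sketch of that loop, using the builtin input (spaCy's own prompt helper may differ):

    def fill_meta(settings):
        meta = {}
        for key, desc, default in settings:
            response = input('%s (%s): ' % (desc, default))
            meta[key] = response if response else default
        return meta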

spacy/cli/profile.py

@@ -27,15 +27,15 @@ def read_inputs(loc):
@plac.annotations(
lang=("model/language", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs)
)
inputs=("Location of input file", "positional", None, read_inputs))
def profile(cmd, lang, inputs=None):
"""
Profile a spaCy pipeline to find out which functions take the most time.
"""
nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
"Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
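
cProfile.runctx and pstats, as used above, work on any callable. A self-contained example that profiles a toy function and prints the ten most expensive entries by cumulative time:

    import cProfile
    import pstats

    def slow(n):
        return sum(i * i for i in range(n))

    cProfile.runctx('slow(10**6)', globals(), locals(), 'Profile.prof')
    stats = pstats.Stats('Profile.prof')
    stats.strip_dirs().sort_stats('cumulative').print_stats(10)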

spacy/cli/train.py

@@ -2,21 +2,14 @@
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
@@ -31,8 +24,10 @@ numpy.random.seed(0)
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional",
None, str),
dev_data=("location of JSON-formatted development data (optional)",
"positional", None, str),
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
@@ -42,11 +37,12 @@ numpy.random.seed(0)
no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
)
meta_path=("Optional path to meta.json. All relevant properties will be "
"overwritten.", "option", "m", Path))
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False, version="0.0.0", meta_path=None):
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
no_entities=False, gold_preproc=False, version="0.0.0",
meta_path=None):
"""
Train a model. Expects data in spaCy's JSON format.
"""
@@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
meta.setdefault('name', 'unnamed')
pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
if no_tagger and 'tagger' in pipeline:
pipeline.remove('tagger')
if no_parser and 'parser' in pipeline:
pipeline.remove('parser')
if no_entities and 'ner' in pipeline:
pipeline.remove('ner')
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
@@ -139,7 +138,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords/(end_time-start_time)
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
@@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
gpu_wps=gpu_wps)
finally:
print("Saving model...")
try:
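
The comment above about taking dropout and batch size "as generators of values" refers to schedule helpers in spacy.util. A hedged sketch of how such schedules are built and consumed, assuming the v2-era compounding and decaying helpers; the exact rates shown are illustrative:

    from spacy.util import compounding, decaying

    batch_sizes = compounding(1., 32., 1.001)  # grows multiplicatively per step
    dropouts = decaying(0.6, 0.2, 1e-4)        # anneals from 0.6 toward 0.2
    for _ in range(3):
        print(next(batch_sizes), next(dropouts))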

spacy/cli/validate.py

@@ -1,5 +1,5 @@
# coding: utf8
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
import requests
import pkg_resources
@@ -29,8 +29,10 @@ def validate(cmd):
model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d['compat']}
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
incompat_models = {d['name'] for _, d in model_pkgs.items()
if not d['compat']}
incompat_models.update([d['name'] for _, d in model_links.items()
if not d['compat']])
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
@@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):
def get_model_row(compat, name, data, type='package'):
tpl_row = ' {:<10}' + (' {:<20}' * 4)
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
if data['compat']:
@@ -110,7 +111,8 @@ def get_row(*args):
def is_model_path(model_path):
exclude = ['cache', 'pycache', '__pycache__']
name = model_path.parts[-1]
return model_path.is_dir() and name not in exclude and not name.startswith('.')
return (model_path.is_dir() and name not in exclude
and not name.startswith('.'))
def is_compat(compat, name, version):
@@ -118,6 +120,7 @@ def is_compat(compat, name, version):
def reformat_version(version):
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith('-alpha'):
return version.replace('-alpha', 'a0')
return version.replace('-alpha', 'a')
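
A quick check of the fixed helper's behavior, re-declared here so the snippet runs standalone; pip expects PEP 440 pre-release tags like 'a0', which is what the new branch produces:

    def reformat_version(version):
        # old-style versions ended in '-alpha'; pip wants e.g. '2.0.0a0'
        if version.endswith('-alpha'):
            return version.replace('-alpha', 'a0')
        return version.replace('-alpha', 'a')

    assert reformat_version('2.0.0-alpha') == '2.0.0a0'
    assert reformat_version('1.0.0') == '1.0.0'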