mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Tidy up CLI
This commit is contained in:
parent
298c3d973c
commit
d941fc3667
|
@ -7,10 +7,9 @@ from pathlib import Path
|
|||
from .converters import conllu2json, iob2json, conll_ner2json
|
||||
from ..util import prints
|
||||
|
||||
# Converters are matched by file extension. To add a converter, add a new entry
|
||||
# to this dict with the file extension mapped to the converter function imported
|
||||
# from /converters.
|
||||
|
||||
# Converters are matched by file extension. To add a converter, add a new
|
||||
# entry to this dict with the file extension mapped to the converter function
|
||||
# imported from /converters.
|
||||
CONVERTERS = {
|
||||
'conllu': conllu2json,
|
||||
'conll': conllu2json,
|
||||
|
@ -24,8 +23,7 @@ CONVERTERS = {
|
|||
output_dir=("output directory for converted file", "positional", None, str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
||||
)
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool))
|
||||
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
||||
converter='auto'):
|
||||
"""
|
||||
|
@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
|||
prints(output_path, title="Output directory not found", exits=1)
|
||||
if converter == 'auto':
|
||||
converter = input_path.suffix[1:]
|
||||
if not converter in CONVERTERS:
|
||||
if converter not in CONVERTERS:
|
||||
prints("Can't find converter for %s" % converter,
|
||||
title="Unknown format", exits=1)
|
||||
func = CONVERTERS[converter]
|
||||
|
|
|
@ -8,7 +8,8 @@ from ...gold import iob_to_biluo
|
|||
|
||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||
"""
|
||||
Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
|
||||
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
||||
train cli.
|
||||
"""
|
||||
docs = read_conll_ner(input_path)
|
||||
|
||||
|
|
|
@ -13,10 +13,9 @@ from .. import about
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
model=("model to download (shortcut or model name)", "positional", None, str),
|
||||
model=("model to download, shortcut or name)", "positional", None, str),
|
||||
direct=("force direct download. Needs model name with version and won't "
|
||||
"perform compatibility check", "flag", "d", bool)
|
||||
)
|
||||
"perform compatibility check", "flag", "d", bool))
|
||||
def download(cmd, model, direct=False):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
|
@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
|
|||
model_name = shortcuts.get(model, model)
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
|
||||
v=version))
|
||||
if dl == 0:
|
||||
try:
|
||||
# Get package path here because link uses
|
||||
# pip.get_installed_distributions() to check if model is a package,
|
||||
# which fails if model was just installed via subprocess
|
||||
# pip.get_installed_distributions() to check if model is a
|
||||
# package, which fails if model was just installed via
|
||||
# subprocess
|
||||
package_path = get_package_path(model_name)
|
||||
link(None, model_name, model, force=True, model_path=package_path)
|
||||
link(None, model_name, model, force=True,
|
||||
model_path=package_path)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
# loading instructions, even if linking fails.
|
||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name:",
|
||||
# Dirty, but since spacy.download and the auto-linking is
|
||||
# mostly a convenience wrapper, it's best to show a success
|
||||
# message and loading instructions, even if linking fails.
|
||||
prints(
|
||||
"Creating a shortcut link for 'en' didn't work (maybe "
|
||||
"you don't have admin permissions?), but you can still "
|
||||
"load the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful")
|
||||
|
||||
|
@ -52,9 +55,10 @@ def download(cmd, model, direct=False):
|
|||
def get_json(url, desc):
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
|
||||
"(v%s), and download it manually." % (desc, about.__version__),
|
||||
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
|
||||
msg = ("Couldn't fetch %s. Please find a model for your spaCy "
|
||||
"installation (v%s), and download it manually.")
|
||||
prints(msg % (desc, about.__version__), about.__docs_models__,
|
||||
title="Server error (%d)" % r.status_code, exits=1)
|
||||
return r.json()
|
||||
|
||||
|
||||
|
@ -71,13 +75,13 @@ def get_compatibility():
|
|||
def get_version(model, comp):
|
||||
if model not in comp:
|
||||
version = about.__version__
|
||||
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
|
||||
title="Compatibility error", exits=1)
|
||||
msg = "No compatible model found for '%s' (spaCy v%s)."
|
||||
prints(msg % (model, version), title="Compatibility error", exits=1)
|
||||
return comp[model][0]
|
||||
|
||||
|
||||
def download_model(filename):
|
||||
download_url = about.__download_url__ + '/' + filename
|
||||
return subprocess.call([sys.executable, '-m',
|
||||
'pip', 'install', '--no-cache-dir', download_url],
|
||||
env=os.environ.copy())
|
||||
return subprocess.call(
|
||||
[sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
|
||||
download_url], env=os.environ.copy())
|
||||
|
|
|
@ -2,27 +2,15 @@
|
|||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import cytoolz
|
||||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
@ -30,17 +18,18 @@ numpy.random.seed(0)
|
|||
|
||||
@plac.annotations(
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional",
|
||||
None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
|
||||
)
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option",
|
||||
"dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
|
||||
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||
displacy_path=None, displacy_limit=25):
|
||||
"""
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an output
|
||||
directory as the displacy_path argument.
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||
output directory as the displacy_path argument.
|
||||
"""
|
||||
if gpu_id >= 0:
|
||||
util.use_gpu(gpu_id)
|
||||
|
@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
|||
if not data_path.exists():
|
||||
prints(data_path, title="Evaluation data not found", exits=1)
|
||||
if displacy_path and not displacy_path.exists():
|
||||
prints(displacy_path, title="Visualization output directory not found", exits=1)
|
||||
prints(displacy_path, title="Visualization output directory not found",
|
||||
exits=1)
|
||||
corpus = GoldCorpus(data_path, data_path)
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
|
@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
|||
docs, golds = zip(*dev_docs)
|
||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
|
||||
deps=render_deps, ents=render_ents)
|
||||
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
|
||||
render_parses(docs, displacy_path, model_name=model,
|
||||
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
||||
msg = "Generated %s parses as HTML" % displacy_limit
|
||||
prints(displacy_path, title=msg)
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
|
||||
ents=True):
|
||||
docs[0].user_data['title'] = model_name
|
||||
if ents:
|
||||
with (output_path / 'entities.html').open('w') as file_:
|
||||
|
@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T
|
|||
file_.write(html)
|
||||
if deps:
|
||||
with (output_path / 'parses.html').open('w') as file_:
|
||||
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
|
||||
html = displacy.render(docs[:limit], style='dep', page=True,
|
||||
options={'compact': True})
|
||||
file_.write(html)
|
||||
|
||||
|
||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
|||
|
||||
@plac.annotations(
|
||||
model=("optional: shortcut link of model", "positional", None, str),
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
|
||||
)
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str))
|
||||
def info(cmd, model=None, markdown=False):
|
||||
"""Print info about spaCy installation. If a model shortcut link is
|
||||
speficied as an argument, print model information. Flag --markdown
|
||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
|||
@plac.annotations(
|
||||
origin=("package name or local path to model", "positional", None, str),
|
||||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool)
|
||||
)
|
||||
force=("force overwriting of existing link", "flag", "f", bool))
|
||||
def link(cmd, origin, link_name, force=False, model_path=None):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
|
@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None):
|
|||
# This is quite dirty, but just making sure other errors are caught.
|
||||
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
||||
"the required permissions and try re-running the command as "
|
||||
"admin, or use a virtualenv. You can still import the model as a "
|
||||
"module and call its load() method, or create the symlink manually.",
|
||||
"admin, or use a virtualenv. You can still import the model as "
|
||||
"a module and call its load() method, or create the symlink "
|
||||
"manually.",
|
||||
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||
title="Error: Couldn't link model to '%s'" % link_name)
|
||||
raise
|
||||
|
|
|
@ -16,10 +16,12 @@ from .. import about
|
|||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta_path=("path to meta.json", "option", "m", str),
|
||||
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
|
||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
||||
)
|
||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||
create_meta=("create meta.json, even if one exists in directory", "flag",
|
||||
"c", bool),
|
||||
force=("force overwriting of existing folder in output directory", "flag",
|
||||
"f", bool))
|
||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
|
||||
force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
|
@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force
|
|||
package_path = main_path / model_name
|
||||
|
||||
create_dirs(package_path, force)
|
||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||
shutil.copytree(path2str(input_path),
|
||||
path2str(package_path / model_name_v))
|
||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||
create_file(main_path / 'setup.py', template_setup)
|
||||
create_file(main_path / 'MANIFEST.in', template_manifest)
|
||||
create_file(package_path / '__init__.py', template_init)
|
||||
prints(main_path, "To build the package, run `python setup.py sdist` in this "
|
||||
"directory.", title="Successfully created package '%s'" % model_name_v)
|
||||
prints(main_path, "To build the package, run `python setup.py sdist` in "
|
||||
"this directory.",
|
||||
title="Successfully created package '%s'" % model_name_v)
|
||||
|
||||
|
||||
def create_dirs(package_path, force):
|
||||
|
@ -66,9 +70,10 @@ def create_dirs(package_path, force):
|
|||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
else:
|
||||
prints(package_path, "Please delete the directory and try again, or "
|
||||
"use the --force flag to overwrite existing directories.",
|
||||
title="Package directory already exists", exits=1)
|
||||
prints(package_path, "Please delete the directory and try again, "
|
||||
"or use the --force flag to overwrite existing "
|
||||
"directories.", title="Package directory already exists",
|
||||
exits=1)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
|
||||
|
||||
|
@ -82,7 +87,8 @@ def generate_meta(model_path):
|
|||
settings = [('lang', 'Model language', 'en'),
|
||||
('name', 'Model name', 'model'),
|
||||
('version', 'Model version', '0.0.0'),
|
||||
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
|
||||
('spacy_version', 'Required spaCy version',
|
||||
'>=%s,<3.0.0' % about.__version__),
|
||||
('description', 'Model description', False),
|
||||
('author', 'Author', False),
|
||||
('email', 'Author email', False),
|
||||
|
|
|
@ -27,15 +27,15 @@ def read_inputs(loc):
|
|||
|
||||
@plac.annotations(
|
||||
lang=("model/language", "positional", None, str),
|
||||
inputs=("Location of input file", "positional", None, read_inputs)
|
||||
)
|
||||
inputs=("Location of input file", "positional", None, read_inputs))
|
||||
def profile(cmd, lang, inputs=None):
|
||||
"""
|
||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||
"""
|
||||
nlp = spacy.load(lang)
|
||||
texts = list(cytoolz.take(10000, inputs))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
||||
"Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
s.strip_dirs().sort_stats("time").print_stats()
|
||||
|
||||
|
|
|
@ -2,21 +2,14 @@
|
|||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import cytoolz
|
||||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
@ -31,8 +24,10 @@ numpy.random.seed(0)
|
|||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("output directory to store model in", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional", None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional",
|
||||
None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)",
|
||||
"positional", None, str),
|
||||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
|
@ -42,11 +37,12 @@ numpy.random.seed(0)
|
|||
no_entities=("Don't train NER", "flag", "N", bool),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
version=("Model version", "option", "V", str),
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
|
||||
)
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||
"overwritten.", "option", "m", Path))
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
|
||||
gold_preproc=False, version="0.0.0", meta_path=None):
|
||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
|
||||
no_entities=False, gold_preproc=False, version="0.0.0",
|
||||
meta_path=None):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
|
@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
meta.setdefault('name', 'unnamed')
|
||||
|
||||
pipeline = ['tagger', 'parser', 'ner']
|
||||
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
|
||||
if no_tagger and 'tagger' in pipeline:
|
||||
pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline:
|
||||
pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline:
|
||||
pipeline.remove('ner')
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
|
@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
with meta_loc.open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
||||
gpu_wps=gpu_wps)
|
||||
finally:
|
||||
print("Saving model...")
|
||||
try:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import requests
|
||||
import pkg_resources
|
||||
|
@ -29,8 +29,10 @@ def validate(cmd):
|
|||
model_links = get_model_links(current_compat)
|
||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
||||
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
|
||||
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
|
||||
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
||||
if not d['compat']}
|
||||
incompat_models.update([d['name'] for _, d in model_links.items()
|
||||
if not d['compat']])
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
|
||||
|
@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):
|
|||
|
||||
|
||||
def get_model_row(compat, name, data, type='package'):
|
||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
||||
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
||||
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
||||
if data['compat']:
|
||||
|
@ -110,7 +111,8 @@ def get_row(*args):
|
|||
def is_model_path(model_path):
|
||||
exclude = ['cache', 'pycache', '__pycache__']
|
||||
name = model_path.parts[-1]
|
||||
return model_path.is_dir() and name not in exclude and not name.startswith('.')
|
||||
return (model_path.is_dir() and name not in exclude
|
||||
and not name.startswith('.'))
|
||||
|
||||
|
||||
def is_compat(compat, name, version):
|
||||
|
@ -118,6 +120,7 @@ def is_compat(compat, name, version):
|
|||
|
||||
|
||||
def reformat_version(version):
|
||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||
if version.endswith('-alpha'):
|
||||
return version.replace('-alpha', 'a0')
|
||||
return version.replace('-alpha', 'a')
|
||||
|
|
Loading…
Reference in New Issue
Block a user