mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Tidy up CLI
This commit is contained in:
parent
298c3d973c
commit
d941fc3667
|
@ -7,10 +7,9 @@ from pathlib import Path
|
||||||
from .converters import conllu2json, iob2json, conll_ner2json
|
from .converters import conllu2json, iob2json, conll_ner2json
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
|
|
||||||
# Converters are matched by file extension. To add a converter, add a new entry
|
# Converters are matched by file extension. To add a converter, add a new
|
||||||
# to this dict with the file extension mapped to the converter function imported
|
# entry to this dict with the file extension mapped to the converter function
|
||||||
# from /converters.
|
# imported from /converters.
|
||||||
|
|
||||||
CONVERTERS = {
|
CONVERTERS = {
|
||||||
'conllu': conllu2json,
|
'conllu': conllu2json,
|
||||||
'conll': conllu2json,
|
'conll': conllu2json,
|
||||||
|
@ -24,8 +23,7 @@ CONVERTERS = {
|
||||||
output_dir=("output directory for converted file", "positional", None, str),
|
output_dir=("output directory for converted file", "positional", None, str),
|
||||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
morphology=("Enable appending morphology to tags", "flag", "m", bool))
|
||||||
)
|
|
||||||
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
||||||
converter='auto'):
|
converter='auto'):
|
||||||
"""
|
"""
|
||||||
|
@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
||||||
prints(output_path, title="Output directory not found", exits=1)
|
prints(output_path, title="Output directory not found", exits=1)
|
||||||
if converter == 'auto':
|
if converter == 'auto':
|
||||||
converter = input_path.suffix[1:]
|
converter = input_path.suffix[1:]
|
||||||
if not converter in CONVERTERS:
|
if converter not in CONVERTERS:
|
||||||
prints("Can't find converter for %s" % converter,
|
prints("Can't find converter for %s" % converter,
|
||||||
title="Unknown format", exits=1)
|
title="Unknown format", exits=1)
|
||||||
func = CONVERTERS[converter]
|
func = CONVERTERS[converter]
|
||||||
|
|
|
@ -8,7 +8,8 @@ from ...gold import iob_to_biluo
|
||||||
|
|
||||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||||
"""
|
"""
|
||||||
Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
|
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
||||||
|
train cli.
|
||||||
"""
|
"""
|
||||||
docs = read_conll_ner(input_path)
|
docs = read_conll_ner(input_path)
|
||||||
|
|
||||||
|
|
|
@ -13,10 +13,9 @@ from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("model to download (shortcut or model name)", "positional", None, str),
|
model=("model to download, shortcut or name)", "positional", None, str),
|
||||||
direct=("force direct download. Needs model name with version and won't "
|
direct=("force direct download. Needs model name with version and won't "
|
||||||
"perform compatibility check", "flag", "d", bool)
|
"perform compatibility check", "flag", "d", bool))
|
||||||
)
|
|
||||||
def download(cmd, model, direct=False):
|
def download(cmd, model, direct=False):
|
||||||
"""
|
"""
|
||||||
Download compatible model from default download path using pip. Model
|
Download compatible model from default download path using pip. Model
|
||||||
|
@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
|
||||||
model_name = shortcuts.get(model, model)
|
model_name = shortcuts.get(model, model)
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
|
||||||
|
v=version))
|
||||||
if dl == 0:
|
if dl == 0:
|
||||||
try:
|
try:
|
||||||
# Get package path here because link uses
|
# Get package path here because link uses
|
||||||
# pip.get_installed_distributions() to check if model is a package,
|
# pip.get_installed_distributions() to check if model is a
|
||||||
# which fails if model was just installed via subprocess
|
# package, which fails if model was just installed via
|
||||||
|
# subprocess
|
||||||
package_path = get_package_path(model_name)
|
package_path = get_package_path(model_name)
|
||||||
link(None, model_name, model, force=True, model_path=package_path)
|
link(None, model_name, model, force=True,
|
||||||
|
model_path=package_path)
|
||||||
except:
|
except:
|
||||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
# Dirty, but since spacy.download and the auto-linking is
|
||||||
# a convenience wrapper, it's best to show a success message and
|
# mostly a convenience wrapper, it's best to show a success
|
||||||
# loading instructions, even if linking fails.
|
# message and loading instructions, even if linking fails.
|
||||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
prints(
|
||||||
"don't have admin permissions?), but you can still load "
|
"Creating a shortcut link for 'en' didn't work (maybe "
|
||||||
"the model via its full package name:",
|
"you don't have admin permissions?), but you can still "
|
||||||
|
"load the model via its full package name:",
|
||||||
"nlp = spacy.load('%s')" % model_name,
|
"nlp = spacy.load('%s')" % model_name,
|
||||||
title="Download successful")
|
title="Download successful")
|
||||||
|
|
||||||
|
@ -52,9 +55,10 @@ def download(cmd, model, direct=False):
|
||||||
def get_json(url, desc):
|
def get_json(url, desc):
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
|
msg = ("Couldn't fetch %s. Please find a model for your spaCy "
|
||||||
"(v%s), and download it manually." % (desc, about.__version__),
|
"installation (v%s), and download it manually.")
|
||||||
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
|
prints(msg % (desc, about.__version__), about.__docs_models__,
|
||||||
|
title="Server error (%d)" % r.status_code, exits=1)
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
||||||
|
|
||||||
|
@ -71,13 +75,13 @@ def get_compatibility():
|
||||||
def get_version(model, comp):
|
def get_version(model, comp):
|
||||||
if model not in comp:
|
if model not in comp:
|
||||||
version = about.__version__
|
version = about.__version__
|
||||||
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
|
msg = "No compatible model found for '%s' (spaCy v%s)."
|
||||||
title="Compatibility error", exits=1)
|
prints(msg % (model, version), title="Compatibility error", exits=1)
|
||||||
return comp[model][0]
|
return comp[model][0]
|
||||||
|
|
||||||
|
|
||||||
def download_model(filename):
|
def download_model(filename):
|
||||||
download_url = about.__download_url__ + '/' + filename
|
download_url = about.__download_url__ + '/' + filename
|
||||||
return subprocess.call([sys.executable, '-m',
|
return subprocess.call(
|
||||||
'pip', 'install', '--no-cache-dir', download_url],
|
[sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
|
||||||
env=os.environ.copy())
|
download_url], env=os.environ.copy())
|
||||||
|
|
|
@ -2,27 +2,15 @@
|
||||||
from __future__ import unicode_literals, division, print_function
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import json
|
|
||||||
from collections import defaultdict
|
|
||||||
import cytoolz
|
|
||||||
from pathlib import Path
|
|
||||||
import dill
|
|
||||||
import tqdm
|
|
||||||
from thinc.neural._classes.model import Model
|
|
||||||
from thinc.neural.optimizers import linear_decay
|
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
|
||||||
from ..tokens.doc import Doc
|
from ..gold import GoldCorpus
|
||||||
from ..scorer import Scorer
|
|
||||||
from ..gold import GoldParse, merge_sents
|
|
||||||
from ..gold import GoldCorpus, minibatch
|
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
from ..compat import json_dumps
|
|
||||||
|
|
||||||
random.seed(0)
|
random.seed(0)
|
||||||
numpy.random.seed(0)
|
numpy.random.seed(0)
|
||||||
|
@ -30,17 +18,18 @@ numpy.random.seed(0)
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("Model name or path", "positional", None, str),
|
model=("Model name or path", "positional", None, str),
|
||||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
data_path=("Location of JSON-formatted evaluation data", "positional",
|
||||||
|
None, str),
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
gpu_id=("Use GPU", "option", "g", int),
|
gpu_id=("Use GPU", "option", "g", int),
|
||||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
displacy_path=("Directory to output rendered parses as HTML", "option",
|
||||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
|
"dp", str),
|
||||||
)
|
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
|
||||||
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||||
displacy_path=None, displacy_limit=25):
|
displacy_path=None, displacy_limit=25):
|
||||||
"""
|
"""
|
||||||
Evaluate a model. To render a sample of parses in a HTML file, set an output
|
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||||
directory as the displacy_path argument.
|
output directory as the displacy_path argument.
|
||||||
"""
|
"""
|
||||||
if gpu_id >= 0:
|
if gpu_id >= 0:
|
||||||
util.use_gpu(gpu_id)
|
util.use_gpu(gpu_id)
|
||||||
|
@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||||
if not data_path.exists():
|
if not data_path.exists():
|
||||||
prints(data_path, title="Evaluation data not found", exits=1)
|
prints(data_path, title="Evaluation data not found", exits=1)
|
||||||
if displacy_path and not displacy_path.exists():
|
if displacy_path and not displacy_path.exists():
|
||||||
prints(displacy_path, title="Visualization output directory not found", exits=1)
|
prints(displacy_path, title="Visualization output directory not found",
|
||||||
|
exits=1)
|
||||||
corpus = GoldCorpus(data_path, data_path)
|
corpus = GoldCorpus(data_path, data_path)
|
||||||
nlp = util.load_model(model)
|
nlp = util.load_model(model)
|
||||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||||
|
@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||||
docs, golds = zip(*dev_docs)
|
docs, golds = zip(*dev_docs)
|
||||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
||||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||||
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
|
render_parses(docs, displacy_path, model_name=model,
|
||||||
deps=render_deps, ents=render_ents)
|
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
||||||
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
|
msg = "Generated %s parses as HTML" % displacy_limit
|
||||||
|
prints(displacy_path, title=msg)
|
||||||
|
|
||||||
|
|
||||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
|
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
|
||||||
|
ents=True):
|
||||||
docs[0].user_data['title'] = model_name
|
docs[0].user_data['title'] = model_name
|
||||||
if ents:
|
if ents:
|
||||||
with (output_path / 'entities.html').open('w') as file_:
|
with (output_path / 'entities.html').open('w') as file_:
|
||||||
|
@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
if deps:
|
if deps:
|
||||||
with (output_path / 'parses.html').open('w') as file_:
|
with (output_path / 'parses.html').open('w') as file_:
|
||||||
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
|
html = displacy.render(docs[:limit], style='dep', page=True,
|
||||||
|
options={'compact': True})
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("optional: shortcut link of model", "positional", None, str),
|
model=("optional: shortcut link of model", "positional", None, str),
|
||||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
|
markdown=("generate Markdown for GitHub issues", "flag", "md", str))
|
||||||
)
|
|
||||||
def info(cmd, model=None, markdown=False):
|
def info(cmd, model=None, markdown=False):
|
||||||
"""Print info about spaCy installation. If a model shortcut link is
|
"""Print info about spaCy installation. If a model shortcut link is
|
||||||
speficied as an argument, print model information. Flag --markdown
|
speficied as an argument, print model information. Flag --markdown
|
||||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
origin=("package name or local path to model", "positional", None, str),
|
origin=("package name or local path to model", "positional", None, str),
|
||||||
link_name=("name of shortuct link to create", "positional", None, str),
|
link_name=("name of shortuct link to create", "positional", None, str),
|
||||||
force=("force overwriting of existing link", "flag", "f", bool)
|
force=("force overwriting of existing link", "flag", "f", bool))
|
||||||
)
|
|
||||||
def link(cmd, origin, link_name, force=False, model_path=None):
|
def link(cmd, origin, link_name, force=False, model_path=None):
|
||||||
"""
|
"""
|
||||||
Create a symlink for models within the spacy/data directory. Accepts
|
Create a symlink for models within the spacy/data directory. Accepts
|
||||||
|
@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None):
|
||||||
# This is quite dirty, but just making sure other errors are caught.
|
# This is quite dirty, but just making sure other errors are caught.
|
||||||
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
||||||
"the required permissions and try re-running the command as "
|
"the required permissions and try re-running the command as "
|
||||||
"admin, or use a virtualenv. You can still import the model as a "
|
"admin, or use a virtualenv. You can still import the model as "
|
||||||
"module and call its load() method, or create the symlink manually.",
|
"a module and call its load() method, or create the symlink "
|
||||||
|
"manually.",
|
||||||
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||||
title="Error: Couldn't link model to '%s'" % link_name)
|
title="Error: Couldn't link model to '%s'" % link_name)
|
||||||
raise
|
raise
|
||||||
|
|
|
@ -16,10 +16,12 @@ from .. import about
|
||||||
input_dir=("directory with model data", "positional", None, str),
|
input_dir=("directory with model data", "positional", None, str),
|
||||||
output_dir=("output parent directory", "positional", None, str),
|
output_dir=("output parent directory", "positional", None, str),
|
||||||
meta_path=("path to meta.json", "option", "m", str),
|
meta_path=("path to meta.json", "option", "m", str),
|
||||||
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
|
create_meta=("create meta.json, even if one exists in directory", "flag",
|
||||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
"c", bool),
|
||||||
)
|
force=("force overwriting of existing folder in output directory", "flag",
|
||||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
"f", bool))
|
||||||
|
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
|
||||||
|
force=False):
|
||||||
"""
|
"""
|
||||||
Generate Python package for model data, including meta and required
|
Generate Python package for model data, including meta and required
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
|
@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
create_dirs(package_path, force)
|
create_dirs(package_path, force)
|
||||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
shutil.copytree(path2str(input_path),
|
||||||
|
path2str(package_path / model_name_v))
|
||||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||||
create_file(main_path / 'setup.py', template_setup)
|
create_file(main_path / 'setup.py', template_setup)
|
||||||
create_file(main_path / 'MANIFEST.in', template_manifest)
|
create_file(main_path / 'MANIFEST.in', template_manifest)
|
||||||
create_file(package_path / '__init__.py', template_init)
|
create_file(package_path / '__init__.py', template_init)
|
||||||
prints(main_path, "To build the package, run `python setup.py sdist` in this "
|
prints(main_path, "To build the package, run `python setup.py sdist` in "
|
||||||
"directory.", title="Successfully created package '%s'" % model_name_v)
|
"this directory.",
|
||||||
|
title="Successfully created package '%s'" % model_name_v)
|
||||||
|
|
||||||
|
|
||||||
def create_dirs(package_path, force):
|
def create_dirs(package_path, force):
|
||||||
|
@ -66,9 +70,10 @@ def create_dirs(package_path, force):
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(path2str(package_path))
|
shutil.rmtree(path2str(package_path))
|
||||||
else:
|
else:
|
||||||
prints(package_path, "Please delete the directory and try again, or "
|
prints(package_path, "Please delete the directory and try again, "
|
||||||
"use the --force flag to overwrite existing directories.",
|
"or use the --force flag to overwrite existing "
|
||||||
title="Package directory already exists", exits=1)
|
"directories.", title="Package directory already exists",
|
||||||
|
exits=1)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -82,7 +87,8 @@ def generate_meta(model_path):
|
||||||
settings = [('lang', 'Model language', 'en'),
|
settings = [('lang', 'Model language', 'en'),
|
||||||
('name', 'Model name', 'model'),
|
('name', 'Model name', 'model'),
|
||||||
('version', 'Model version', '0.0.0'),
|
('version', 'Model version', '0.0.0'),
|
||||||
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
|
('spacy_version', 'Required spaCy version',
|
||||||
|
'>=%s,<3.0.0' % about.__version__),
|
||||||
('description', 'Model description', False),
|
('description', 'Model description', False),
|
||||||
('author', 'Author', False),
|
('author', 'Author', False),
|
||||||
('email', 'Author email', False),
|
('email', 'Author email', False),
|
||||||
|
|
|
@ -27,15 +27,15 @@ def read_inputs(loc):
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model/language", "positional", None, str),
|
lang=("model/language", "positional", None, str),
|
||||||
inputs=("Location of input file", "positional", None, read_inputs)
|
inputs=("Location of input file", "positional", None, read_inputs))
|
||||||
)
|
|
||||||
def profile(cmd, lang, inputs=None):
|
def profile(cmd, lang, inputs=None):
|
||||||
"""
|
"""
|
||||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||||
"""
|
"""
|
||||||
nlp = spacy.load(lang)
|
nlp = spacy.load(lang)
|
||||||
texts = list(cytoolz.take(10000, inputs))
|
texts = list(cytoolz.take(10000, inputs))
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
||||||
|
"Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
|
@ -2,21 +2,14 @@
|
||||||
from __future__ import unicode_literals, division, print_function
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import json
|
|
||||||
from collections import defaultdict
|
|
||||||
import cytoolz
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import dill
|
import dill
|
||||||
import tqdm
|
import tqdm
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from thinc.neural.optimizers import linear_decay
|
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
|
||||||
from ..tokens.doc import Doc
|
|
||||||
from ..scorer import Scorer
|
|
||||||
from ..gold import GoldParse, merge_sents
|
|
||||||
from ..gold import GoldCorpus, minibatch
|
from ..gold import GoldCorpus, minibatch
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
|
@ -31,8 +24,10 @@ numpy.random.seed(0)
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model language", "positional", None, str),
|
lang=("model language", "positional", None, str),
|
||||||
output_dir=("output directory to store model in", "positional", None, str),
|
output_dir=("output directory to store model in", "positional", None, str),
|
||||||
train_data=("location of JSON-formatted training data", "positional", None, str),
|
train_data=("location of JSON-formatted training data", "positional",
|
||||||
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
|
None, str),
|
||||||
|
dev_data=("location of JSON-formatted development data (optional)",
|
||||||
|
"positional", None, str),
|
||||||
n_iter=("number of iterations", "option", "n", int),
|
n_iter=("number of iterations", "option", "n", int),
|
||||||
n_sents=("number of sentences", "option", "ns", int),
|
n_sents=("number of sentences", "option", "ns", int),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu=("Use GPU", "option", "g", int),
|
||||||
|
@ -42,11 +37,12 @@ numpy.random.seed(0)
|
||||||
no_entities=("Don't train NER", "flag", "N", bool),
|
no_entities=("Don't train NER", "flag", "N", bool),
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
version=("Model version", "option", "V", str),
|
version=("Model version", "option", "V", str),
|
||||||
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
|
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||||
)
|
"overwritten.", "option", "m", Path))
|
||||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
|
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
|
||||||
gold_preproc=False, version="0.0.0", meta_path=None):
|
no_entities=False, gold_preproc=False, version="0.0.0",
|
||||||
|
meta_path=None):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train a model. Expects data in spaCy's JSON format.
|
||||||
"""
|
"""
|
||||||
|
@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
meta.setdefault('name', 'unnamed')
|
meta.setdefault('name', 'unnamed')
|
||||||
|
|
||||||
pipeline = ['tagger', 'parser', 'ner']
|
pipeline = ['tagger', 'parser', 'ner']
|
||||||
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
if no_tagger and 'tagger' in pipeline:
|
||||||
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
|
pipeline.remove('tagger')
|
||||||
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
|
if no_parser and 'parser' in pipeline:
|
||||||
|
pipeline.remove('parser')
|
||||||
|
if no_entities and 'ner' in pipeline:
|
||||||
|
pipeline.remove('ner')
|
||||||
|
|
||||||
# Take dropout and batch size as generators of values -- dropout
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
# starts high and decays sharply, to force the optimizer to explore.
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
|
@ -139,7 +138,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
scorer = nlp_loaded.evaluate(dev_docs)
|
scorer = nlp_loaded.evaluate(dev_docs)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords/(end_time-start_time)
|
||||||
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
||||||
with acc_loc.open('w') as file_:
|
with acc_loc.open('w') as file_:
|
||||||
file_.write(json_dumps(scorer.scores))
|
file_.write(json_dumps(scorer.scores))
|
||||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
||||||
|
@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
with meta_loc.open('w') as file_:
|
with meta_loc.open('w') as file_:
|
||||||
file_.write(json_dumps(meta))
|
file_.write(json_dumps(meta))
|
||||||
util.set_env_log(True)
|
util.set_env_log(True)
|
||||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
||||||
|
gpu_wps=gpu_wps)
|
||||||
finally:
|
finally:
|
||||||
print("Saving model...")
|
print("Saving model...")
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
@ -29,8 +29,10 @@ def validate(cmd):
|
||||||
model_links = get_model_links(current_compat)
|
model_links = get_model_links(current_compat)
|
||||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
||||||
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
|
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
||||||
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
|
if not d['compat']}
|
||||||
|
incompat_models.update([d['name'] for _, d in model_links.items()
|
||||||
|
if not d['compat']])
|
||||||
na_models = [m for m in incompat_models if m not in current_compat]
|
na_models = [m for m in incompat_models if m not in current_compat]
|
||||||
update_models = [m for m in incompat_models if m in current_compat]
|
update_models = [m for m in incompat_models if m in current_compat]
|
||||||
|
|
||||||
|
@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):
|
||||||
|
|
||||||
|
|
||||||
def get_model_row(compat, name, data, type='package'):
|
def get_model_row(compat, name, data, type='package'):
|
||||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
|
||||||
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
||||||
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
||||||
if data['compat']:
|
if data['compat']:
|
||||||
|
@ -110,7 +111,8 @@ def get_row(*args):
|
||||||
def is_model_path(model_path):
|
def is_model_path(model_path):
|
||||||
exclude = ['cache', 'pycache', '__pycache__']
|
exclude = ['cache', 'pycache', '__pycache__']
|
||||||
name = model_path.parts[-1]
|
name = model_path.parts[-1]
|
||||||
return model_path.is_dir() and name not in exclude and not name.startswith('.')
|
return (model_path.is_dir() and name not in exclude
|
||||||
|
and not name.startswith('.'))
|
||||||
|
|
||||||
|
|
||||||
def is_compat(compat, name, version):
|
def is_compat(compat, name, version):
|
||||||
|
@ -118,6 +120,7 @@ def is_compat(compat, name, version):
|
||||||
|
|
||||||
|
|
||||||
def reformat_version(version):
|
def reformat_version(version):
|
||||||
|
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||||
if version.endswith('-alpha'):
|
if version.endswith('-alpha'):
|
||||||
return version.replace('-alpha', 'a0')
|
return version.replace('-alpha', 'a0')
|
||||||
return version.replace('-alpha', 'a')
|
return version.replace('-alpha', 'a')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user