mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Tidy up CLI and fix print functions
This commit is contained in:
parent
311704674d
commit
59c3b9d4dd
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from . import util
|
from . import util
|
||||||
|
from .util import prints
|
||||||
from .deprecated import resolve_model_name
|
from .deprecated import resolve_model_name
|
||||||
from .cli.info import info
|
from .cli.info import info
|
||||||
from .glossary import explain
|
from .glossary import explain
|
||||||
|
@ -26,9 +27,8 @@ def load(name, **overrides):
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
lang_name = util.get_lang_class(name).lang
|
lang_name = util.get_lang_class(name).lang
|
||||||
model_path = None
|
model_path = None
|
||||||
util.print_msg(
|
prints("Only loading the '%s' tokenizer." % lang_name,
|
||||||
"Only loading the '{}' tokenizer.".format(lang_name),
|
title="Warning: no model found for '%s'" % name)
|
||||||
title="Warning: no model found for '{}'".format(name))
|
|
||||||
else:
|
else:
|
||||||
model_path = util.ensure_path(overrides['path'])
|
model_path = util.ensure_path(overrides['path'])
|
||||||
data_path = model_path.parent
|
data_path = model_path.parent
|
||||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from .converters import conllu2json
|
from .converters import conllu2json
|
||||||
from .. import util
|
from ..util import prints
|
||||||
|
|
||||||
|
|
||||||
# Converters are matched by file extension. To add a converter, add a new entry
|
# Converters are matched by file extension. To add a converter, add a new entry
|
||||||
|
@ -19,17 +19,12 @@ CONVERTERS = {
|
||||||
def convert(input_file, output_dir, *args):
|
def convert(input_file, output_dir, *args):
|
||||||
input_path = Path(input_file)
|
input_path = Path(input_file)
|
||||||
output_path = Path(output_dir)
|
output_path = Path(output_dir)
|
||||||
check_dirs(input_path, output_path)
|
if not input_path.exists():
|
||||||
file_ext = input_path.suffix
|
prints(input_path, title="Input file not found", exits=True)
|
||||||
if file_ext in CONVERTERS:
|
|
||||||
CONVERTERS[file_ext](input_path, output_path, *args)
|
|
||||||
else:
|
|
||||||
util.sys_exit("Can't find converter for {}".format(input_path.parts[-1]),
|
|
||||||
title="Unknown format")
|
|
||||||
|
|
||||||
|
|
||||||
def check_dirs(input_file, output_path):
|
|
||||||
if not input_file.exists():
|
|
||||||
util.sys_exit(input_file.as_posix(), title="Input file not found")
|
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
util.sys_exit(output_path.as_posix(), title="Output directory not found")
|
prints(output_path, title="Output directory not found", exits=True)
|
||||||
|
file_ext = input_path.suffix
|
||||||
|
if not file_ext in CONVERTERS:
|
||||||
|
prints("Can't find converter for %s" % input_path.parts[-1],
|
||||||
|
title="Unknown format", exits=True)
|
||||||
|
CONVERTERS[file_ext](input_path, output_path, *args)
|
||||||
|
|
|
@ -1,9 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import json
|
from ...compat import json_dumps, path2str
|
||||||
from ...compat import json_dumps
|
from ...util import prints
|
||||||
from ... import util
|
|
||||||
|
|
||||||
|
|
||||||
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||||
|
@ -32,8 +31,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||||
output_file = output_path / output_filename
|
output_file = output_path / output_filename
|
||||||
with output_file.open('w', encoding='utf-8') as f:
|
with output_file.open('w', encoding='utf-8') as f:
|
||||||
f.write(json_dumps(docs))
|
f.write(json_dumps(docs))
|
||||||
util.print_msg("Created {} documents".format(len(docs)),
|
prints("Created %d documents" % len(docs),
|
||||||
title="Generated output file {}".format(output_file))
|
title="Generated output file %s" % path2str(output_file))
|
||||||
|
|
||||||
|
|
||||||
def read_conllx(input_path, use_morphology=False, n=0):
|
def read_conllx(input_path, use_morphology=False, n=0):
|
||||||
|
|
|
@ -6,78 +6,52 @@ import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .link import link_package
|
from .link import link
|
||||||
|
from ..util import prints
|
||||||
from .. import about
|
from .. import about
|
||||||
from .. import util
|
|
||||||
|
|
||||||
|
|
||||||
def download(model=None, direct=False):
|
def download(model, direct=False):
|
||||||
check_error_depr(model)
|
|
||||||
|
|
||||||
if direct:
|
if direct:
|
||||||
download_model('{m}/{m}.tar.gz'.format(m=model))
|
download_model('{m}/{m}.tar.gz'.format(m=model))
|
||||||
else:
|
else:
|
||||||
model_name = check_shortcut(model)
|
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||||
|
model_name = shortcuts.get(model, model)
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||||
link_package(model_name, model, force=True)
|
link(model_name, model, force=True)
|
||||||
|
|
||||||
|
|
||||||
def get_json(url, desc):
|
def get_json(url, desc):
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
util.sys_exit(
|
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
|
||||||
"Couldn't fetch {d}. Please find the right model for your spaCy "
|
"(v%s), and download it manually." % (desc, about.__version__),
|
||||||
"installation (v{v}), and download it manually:".format(d=desc, v=about.__version__),
|
about.__docs__, title="Server error (%d)" % r.status_code, exits=True)
|
||||||
"python -m spacy.download [full model name + version] --direct",
|
|
||||||
title="Server error ({c})".format(c=r.status_code))
|
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
||||||
|
|
||||||
def check_shortcut(model):
|
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
|
||||||
return shortcuts.get(model, model)
|
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility():
|
def get_compatibility():
|
||||||
version = about.__version__
|
version = about.__version__
|
||||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||||
comp = comp_table['spacy']
|
comp = comp_table['spacy']
|
||||||
if version not in comp:
|
if version not in comp:
|
||||||
util.sys_exit(
|
prints("No compatible models found for v%s of spaCy." % version,
|
||||||
"No compatible models found for v{v} of spaCy.".format(v=version),
|
title="Compatibility error", exits=True)
|
||||||
title="Compatibility error")
|
|
||||||
return comp[version]
|
return comp[version]
|
||||||
|
|
||||||
|
|
||||||
def get_version(model, comp):
|
def get_version(model, comp):
|
||||||
if model not in comp:
|
if model not in comp:
|
||||||
util.sys_exit(
|
version = about.__version__
|
||||||
"No compatible model found for "
|
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
|
||||||
"'{m}' (spaCy v{v}).".format(m=model, v=about.__version__),
|
title="Compatibility error", exits=True)
|
||||||
title="Compatibility error")
|
|
||||||
return comp[model][0]
|
return comp[model][0]
|
||||||
|
|
||||||
|
|
||||||
def download_model(filename):
|
def download_model(filename):
|
||||||
util.print_msg("Downloading {f}".format(f=filename))
|
|
||||||
download_url = about.__download_url__ + '/' + filename
|
download_url = about.__download_url__ + '/' + filename
|
||||||
subprocess.call([sys.executable, '-m',
|
subprocess.call([sys.executable, '-m',
|
||||||
'pip', 'install', '--no-cache-dir', download_url],
|
'pip', 'install', '--no-cache-dir', download_url],
|
||||||
env=os.environ.copy())
|
env=os.environ.copy())
|
||||||
|
|
||||||
|
|
||||||
def check_error_depr(model):
|
|
||||||
if not model:
|
|
||||||
util.sys_exit(
|
|
||||||
"python -m spacy.download [name or shortcut]",
|
|
||||||
title="Missing model name or shortcut")
|
|
||||||
|
|
||||||
if model == 'all':
|
|
||||||
util.sys_exit(
|
|
||||||
"As of v1.7.0, the download all command is deprecated. Please "
|
|
||||||
"download the models individually via spacy.download [model name] "
|
|
||||||
"or pip install. For more info on this, see the documentation: "
|
|
||||||
"{d}".format(d=about.__docs__),
|
|
||||||
title="Deprecated command")
|
|
||||||
|
|
|
@ -4,49 +4,46 @@ from __future__ import unicode_literals
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from ..compat import unicode_
|
from ..compat import path2str
|
||||||
from .. import about
|
from .. import about
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
def info(model=None, markdown=False):
|
def info(model=None, markdown=False):
|
||||||
if model:
|
if model:
|
||||||
data = util.parse_package_meta(util.get_data_path(), model, require=True)
|
data_path = util.get_data_path()
|
||||||
model_path = Path(__file__).parent / util.get_data_path() / model
|
data = util.parse_package_meta(data_path, model, require=True)
|
||||||
|
model_path = Path(__file__).parent / data_path / model
|
||||||
if model_path.resolve() != model_path:
|
if model_path.resolve() != model_path:
|
||||||
data['link'] = unicode_(model_path)
|
data['link'] = path2str(model_path)
|
||||||
data['source'] = unicode_(model_path.resolve())
|
data['source'] = path2str(model_path.resolve())
|
||||||
else:
|
else:
|
||||||
data['source'] = unicode_(model_path)
|
data['source'] = path2str(model_path)
|
||||||
print_info(data, "model " + model, markdown)
|
print_info(data, 'model %s' % model, markdown)
|
||||||
else:
|
else:
|
||||||
data = get_spacy_data()
|
data = {'spaCy version': about.__version__,
|
||||||
print_info(data, "spaCy", markdown)
|
'Location': path2str(Path(__file__).parent.parent),
|
||||||
|
'Platform': platform.platform(),
|
||||||
|
'Python version': platform.python_version(),
|
||||||
|
'Models': list_models()}
|
||||||
|
print_info(data, 'spaCy', markdown)
|
||||||
|
|
||||||
|
|
||||||
def print_info(data, title, markdown):
|
def print_info(data, title, markdown):
|
||||||
title = "Info about {title}".format(title=title)
|
title = 'Info about %s' % title
|
||||||
if markdown:
|
if markdown:
|
||||||
util.print_markdown(data, title=title)
|
util.print_markdown(data, title=title)
|
||||||
else:
|
else:
|
||||||
util.print_table(data, title=title)
|
util.print_table(data, title=title)
|
||||||
|
|
||||||
|
|
||||||
def get_spacy_data():
|
|
||||||
return {
|
|
||||||
'spaCy version': about.__version__,
|
|
||||||
'Location': unicode_(Path(__file__).parent.parent),
|
|
||||||
'Platform': platform.platform(),
|
|
||||||
'Python version': platform.python_version(),
|
|
||||||
'Installed models': ', '.join(list_models())
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def list_models():
|
def list_models():
|
||||||
# exclude common cache directories – this means models called "cache" etc.
|
def exclude_dir(dir_name):
|
||||||
# won't show up in list, but it seems worth it
|
# exclude common cache directories and hidden directories
|
||||||
exclude = ['cache', 'pycache', '__pycache__']
|
exclude = ['cache', 'pycache', '__pycache__']
|
||||||
|
return dir_name in exclude or dir_name.startswith('.')
|
||||||
data_path = util.get_data_path()
|
data_path = util.get_data_path()
|
||||||
if data_path:
|
if data_path:
|
||||||
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
||||||
return [m for m in models if m not in exclude]
|
return ', '.join([m for m in models if not exclude_dir(m)])
|
||||||
|
return '-'
|
||||||
|
|
|
@ -1,78 +1,37 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pip
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import importlib
|
from ..compat import symlink_to, path2str
|
||||||
from ..compat import unicode_, symlink_to
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
def link(origin, link_name, force=False):
|
def link(origin, link_name, force=False):
|
||||||
if is_package(origin):
|
if util.is_package(origin):
|
||||||
link_package(origin, link_name, force)
|
model_path = util.get_model_package_path(origin)
|
||||||
else:
|
else:
|
||||||
symlink(origin, link_name, force)
|
model_path = Path(origin)
|
||||||
|
|
||||||
|
|
||||||
def link_package(package_name, link_name, force=False):
|
|
||||||
# Here we're importing the module just to find it. This is worryingly
|
|
||||||
# indirect, but it's otherwise very difficult to find the package.
|
|
||||||
# Python's installation and import rules are very complicated.
|
|
||||||
pkg = importlib.import_module(package_name)
|
|
||||||
package_path = Path(pkg.__file__).parent.parent
|
|
||||||
meta = get_meta(package_path, package_name)
|
|
||||||
model_name = package_name + '-' + meta['version']
|
|
||||||
model_path = package_path / package_name / model_name
|
|
||||||
symlink(model_path, link_name, force)
|
|
||||||
|
|
||||||
|
|
||||||
def symlink(model_path, link_name, force):
|
|
||||||
model_path = Path(model_path)
|
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
util.sys_exit(
|
prints("The data should be located in %s" % path2str(model_path),
|
||||||
"The data should be located in {p}".format(p=model_path),
|
title="Can't locate model data", exits=True)
|
||||||
title="Can't locate model data")
|
|
||||||
|
|
||||||
link_path = util.get_data_path() / link_name
|
link_path = util.get_data_path() / link_name
|
||||||
|
|
||||||
if link_path.exists() and not force:
|
if link_path.exists() and not force:
|
||||||
util.sys_exit(
|
prints("To overwrite an existing link, use the --force flag.",
|
||||||
"To overwrite an existing link, use the --force flag.",
|
title="Link %s already exists" % link_name, exits=True)
|
||||||
title="Link {l} already exists".format(l=link_name))
|
|
||||||
elif link_path.exists():
|
elif link_path.exists():
|
||||||
link_path.unlink()
|
link_path.unlink()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
symlink_to(link_path, model_path)
|
symlink_to(link_path, model_path)
|
||||||
except:
|
except:
|
||||||
# This is quite dirty, but just making sure other errors are caught so
|
# This is quite dirty, but just making sure other errors are caught.
|
||||||
# users at least see a proper message.
|
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
||||||
util.print_msg(
|
"the required permissions and try re-running the command as "
|
||||||
"Creating a symlink in spacy/data failed. Make sure you have the "
|
"admin, or use a virtualenv. You can still import the model as a "
|
||||||
"required permissions and try re-running the command as admin, or "
|
"module and call its load() method, or create the symlink manually.",
|
||||||
"use a virtualenv to install spaCy in a user directory, instead of "
|
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||||
"doing a system installation.",
|
title="Error: Couldn't link model to '%s'" % link_name)
|
||||||
"You can still import the model as a Python package and call its "
|
|
||||||
"load() method, or create the symlink manually:",
|
|
||||||
"{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
|
|
||||||
title="Error: Couldn't link model to '{l}'".format(l=link_name))
|
|
||||||
raise
|
raise
|
||||||
|
prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||||
util.print_msg(
|
"You can now load the model via spacy.load('%s')." % link_name,
|
||||||
"{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
|
|
||||||
"You can now load the model via spacy.load('{l}').".format(l=link_name),
|
|
||||||
title="Linking successful")
|
title="Linking successful")
|
||||||
|
|
||||||
|
|
||||||
def get_meta(package_path, package):
|
|
||||||
meta = util.parse_package_meta(package_path, package)
|
|
||||||
return meta
|
|
||||||
|
|
||||||
|
|
||||||
def is_package(origin):
|
|
||||||
packages = pip.get_installed_distributions()
|
|
||||||
for package in packages:
|
|
||||||
if package.project_name.replace('-', '_') == origin:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
|
@ -4,21 +4,25 @@ from __future__ import unicode_literals
|
||||||
import gzip
|
import gzip
|
||||||
import math
|
import math
|
||||||
from ast import literal_eval
|
from ast import literal_eval
|
||||||
from pathlib import Path
|
|
||||||
from preshed.counter import PreshCounter
|
from preshed.counter import PreshCounter
|
||||||
|
|
||||||
from ..vocab import write_binary_vectors
|
from ..vocab import write_binary_vectors
|
||||||
from ..compat import fix_text
|
from ..compat import fix_text, path2str
|
||||||
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
|
def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
|
||||||
model_path = Path(model_dir)
|
model_path = util.ensure_path(model_dir)
|
||||||
freqs_path = Path(freqs_data)
|
freqs_path = util.ensure_path(freqs_data)
|
||||||
clusters_path = Path(clusters_data) if clusters_data else None
|
clusters_path = util.ensure_path(clusters_data)
|
||||||
vectors_path = Path(vectors_data) if vectors_data else None
|
vectors_path = util.ensure_path(vectors_data)
|
||||||
|
if not freqs_path.is_file():
|
||||||
check_dirs(freqs_path, clusters_path, vectors_path)
|
prints(freqs_path, title="No frequencies file found", exits=True)
|
||||||
|
if clusters_path and not clusters_path.is_file():
|
||||||
|
prints(clusters_path, title="No Brown clusters file found", exits=True)
|
||||||
|
if vectors_path and not vectors_path.is_file():
|
||||||
|
prints(vectors_path, title="No word vectors file found", exits=True)
|
||||||
vocab = util.get_lang_class(lang).Defaults.create_vocab()
|
vocab = util.get_lang_class(lang).Defaults.create_vocab()
|
||||||
probs, oov_prob = read_probs(freqs_path)
|
probs, oov_prob = read_probs(freqs_path)
|
||||||
clusters = read_clusters(clusters_path) if clusters_path else {}
|
clusters = read_clusters(clusters_path) if clusters_path else {}
|
||||||
|
@ -36,14 +40,14 @@ def create_model(model_path, vectors_path, vocab, oov_prob):
|
||||||
model_path.mkdir()
|
model_path.mkdir()
|
||||||
if not vocab_path.exists():
|
if not vocab_path.exists():
|
||||||
vocab_path.mkdir()
|
vocab_path.mkdir()
|
||||||
vocab.dump(lexemes_path.as_posix())
|
vocab.dump(path2str(lexemes_path))
|
||||||
with strings_path.open('w') as f:
|
with strings_path.open('w') as f:
|
||||||
vocab.strings.dump(f)
|
vocab.strings.dump(f)
|
||||||
with oov_path.open('w') as f:
|
with oov_path.open('w') as f:
|
||||||
f.write('%f' % oov_prob)
|
f.write('%f' % oov_prob)
|
||||||
if vectors_path:
|
if vectors_path:
|
||||||
vectors_dest = vocab_path / 'vec.bin'
|
vectors_dest = vocab_path / 'vec.bin'
|
||||||
write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
|
write_binary_vectors(path2str(vectors_path), path2str(vectors_dest))
|
||||||
|
|
||||||
|
|
||||||
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
|
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
|
||||||
|
@ -115,17 +119,8 @@ def populate_vocab(vocab, clusters, probs, oov_prob):
|
||||||
|
|
||||||
|
|
||||||
def check_unzip(file_path):
|
def check_unzip(file_path):
|
||||||
file_path_str = file_path.as_posix()
|
file_path_str = path2str(file_path)
|
||||||
if file_path_str.endswith('gz'):
|
if file_path_str.endswith('gz'):
|
||||||
return gzip.open(file_path_str)
|
return gzip.open(file_path_str)
|
||||||
else:
|
else:
|
||||||
return file_path.open()
|
return file_path.open()
|
||||||
|
|
||||||
|
|
||||||
def check_dirs(freqs_data, clusters_data, vectors_data):
|
|
||||||
if not freqs_data.is_file():
|
|
||||||
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
|
|
||||||
if clusters_data and not clusters_data.is_file():
|
|
||||||
util.sys_exit(clusters_data.as_posix(), title="No Brown clusters file found")
|
|
||||||
if vectors_data and not vectors_data.is_file():
|
|
||||||
util.sys_exit(vectors_data.as_posix(), title="No word vectors file found")
|
|
||||||
|
|
|
@ -5,64 +5,57 @@ import shutil
|
||||||
import requests
|
import requests
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from ..compat import unicode_, json_dumps
|
from ..compat import path2str, json_dumps
|
||||||
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
def package(input_dir, output_dir, meta_path, force):
|
def package(input_dir, output_dir, meta_path, force):
|
||||||
input_path = Path(input_dir)
|
input_path = util.ensure_path(input_dir)
|
||||||
output_path = Path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
check_dirs(input_path, output_path, meta_path)
|
if not input_path or not input_path.exists():
|
||||||
|
prints(input_path, title="Model directory not found", exits=True)
|
||||||
|
if not output_path or not output_path.exists():
|
||||||
|
prints(output_path, title="Output directory not found", exits=True)
|
||||||
|
if meta_path and not meta_path.exists():
|
||||||
|
prints(meta_path, title="meta.json not found", exits=True)
|
||||||
|
|
||||||
template_setup = get_template('setup.py')
|
template_setup = get_template('setup.py')
|
||||||
template_manifest = get_template('MANIFEST.in')
|
template_manifest = get_template('MANIFEST.in')
|
||||||
template_init = get_template('en_model_name/__init__.py')
|
template_init = get_template('en_model_name/__init__.py')
|
||||||
|
|
||||||
meta_path = meta_path or input_path / 'meta.json'
|
meta_path = meta_path or input_path / 'meta.json'
|
||||||
if meta_path.is_file():
|
if meta_path.is_file():
|
||||||
util.print_msg(unicode_(meta_path), title="Reading meta.json from file")
|
prints(meta_path, title="Reading meta.json from file")
|
||||||
meta = util.read_json(meta_path)
|
meta = util.read_json(meta_path)
|
||||||
else:
|
else:
|
||||||
meta = generate_meta()
|
meta = generate_meta()
|
||||||
|
|
||||||
validate_meta(meta, ['lang', 'name', 'version'])
|
validate_meta(meta, ['lang', 'name', 'version'])
|
||||||
|
|
||||||
model_name = meta['lang'] + '_' + meta['name']
|
model_name = meta['lang'] + '_' + meta['name']
|
||||||
model_name_v = model_name + '-' + meta['version']
|
model_name_v = model_name + '-' + meta['version']
|
||||||
main_path = output_path / model_name_v
|
main_path = output_path / model_name_v
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
create_dirs(package_path, force)
|
create_dirs(package_path, force)
|
||||||
shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v))
|
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||||
create_file(main_path / 'setup.py', template_setup)
|
create_file(main_path / 'setup.py', template_setup)
|
||||||
create_file(main_path / 'MANIFEST.in', template_manifest)
|
create_file(main_path / 'MANIFEST.in', template_manifest)
|
||||||
create_file(package_path / '__init__.py', template_init)
|
create_file(package_path / '__init__.py', template_init)
|
||||||
|
prints(main_path, "To build the package, run `python setup.py sdist` in this "
|
||||||
util.print_msg(
|
"directory.", title="Successfully created package '%s'" % model_name_v)
|
||||||
unicode_(main_path),
|
|
||||||
"To build the package, run `python setup.py sdist` in that directory.",
|
|
||||||
title="Successfully created package {p}".format(p=model_name_v))
|
|
||||||
|
|
||||||
|
|
||||||
def check_dirs(input_path, output_path, meta_path):
|
|
||||||
if not input_path.exists():
|
|
||||||
util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
|
|
||||||
if not output_path.exists():
|
|
||||||
util.sys_exit(unicode_(output_path), title="Output directory not found")
|
|
||||||
if meta_path and not meta_path.exists():
|
|
||||||
util.sys_exit(unicode_(meta_path), title="meta.json not found")
|
|
||||||
|
|
||||||
|
|
||||||
def create_dirs(package_path, force):
|
def create_dirs(package_path, force):
|
||||||
if package_path.exists():
|
if package_path.exists():
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(unicode_(package_path))
|
shutil.rmtree(path2str(package_path))
|
||||||
else:
|
else:
|
||||||
util.sys_exit(unicode_(package_path),
|
prints(package_path, "Please delete the directory and try again, or "
|
||||||
"Please delete the directory and try again, or use the --force "
|
"use the --force flag to overwrite existing directories.",
|
||||||
"flag to overwrite existing directories.",
|
title="Package directory already exists", exits=True)
|
||||||
title="Package directory already exists")
|
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,15 +68,14 @@ def generate_meta():
|
||||||
settings = [('lang', 'Model language', 'en'),
|
settings = [('lang', 'Model language', 'en'),
|
||||||
('name', 'Model name', 'model'),
|
('name', 'Model name', 'model'),
|
||||||
('version', 'Model version', '0.0.0'),
|
('version', 'Model version', '0.0.0'),
|
||||||
('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'),
|
('spacy_version', 'Required spaCy version', '>=2.0.0,<3.0.0'),
|
||||||
('description', 'Model description', False),
|
('description', 'Model description', False),
|
||||||
('author', 'Author', False),
|
('author', 'Author', False),
|
||||||
('email', 'Author email', False),
|
('email', 'Author email', False),
|
||||||
('url', 'Author website', False),
|
('url', 'Author website', False),
|
||||||
('license', 'License', 'CC BY-NC 3.0')]
|
('license', 'License', 'CC BY-NC 3.0')]
|
||||||
|
|
||||||
util.print_msg("Enter the package settings for your model.", title="Generating meta.json")
|
prints("Enter the package settings for your model.", title="Generating meta.json")
|
||||||
|
|
||||||
meta = {}
|
meta = {}
|
||||||
for setting, desc, default in settings:
|
for setting, desc, default in settings:
|
||||||
response = util.get_raw_input(desc, default)
|
response = util.get_raw_input(desc, default)
|
||||||
|
@ -94,16 +86,13 @@ def generate_meta():
|
||||||
def validate_meta(meta, keys):
|
def validate_meta(meta, keys):
|
||||||
for key in keys:
|
for key in keys:
|
||||||
if key not in meta or meta[key] == '':
|
if key not in meta or meta[key] == '':
|
||||||
util.sys_exit(
|
prints("This setting is required to build your package.",
|
||||||
"This setting is required to build your package.",
|
title='No "%s" setting found in meta.json' % key, exits=True)
|
||||||
title='No "{k}" setting found in meta.json'.format(k=key))
|
|
||||||
|
|
||||||
|
|
||||||
def get_template(filepath):
|
def get_template(filepath):
|
||||||
url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
|
r = requests.get(about.__model_files__ + filepath)
|
||||||
r = requests.get(url + filepath)
|
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
util.sys_exit(
|
prints("Couldn't fetch template files from GitHub.",
|
||||||
"Couldn't fetch template files from GitHub.",
|
title="Server error (%d)" % r.status_code, exits=True)
|
||||||
title="Server error ({c})".format(c=r.status_code))
|
|
||||||
return r.text
|
return r.text
|
||||||
|
|
|
@ -4,19 +4,24 @@ from __future__ import unicode_literals, division, print_function
|
||||||
import json
|
import json
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from ..util import ensure_path
|
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..gold import GoldParse, merge_sents
|
from ..gold import GoldParse, merge_sents
|
||||||
from ..gold import read_json_file as read_gold_json
|
from ..gold import read_json_file as read_gold_json
|
||||||
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,
|
def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,
|
||||||
parser_L1):
|
parser_L1):
|
||||||
output_path = ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
train_path = ensure_path(train_data)
|
train_path = util.ensure_path(train_data)
|
||||||
dev_path = ensure_path(dev_data)
|
dev_path = util.ensure_path(dev_data)
|
||||||
check_dirs(output_path, train_path, dev_path)
|
if not output_path.exists():
|
||||||
|
prints(output_path, title="Output directory not found", exits=True)
|
||||||
|
if not train_path.exists():
|
||||||
|
prints(train_path, title="Training data not found", exits=True)
|
||||||
|
if dev_path and not dev_path.exists():
|
||||||
|
prints(dev_path, title="Development data not found", exits=True)
|
||||||
|
|
||||||
lang = util.get_lang_class(language)
|
lang = util.get_lang_class(language)
|
||||||
parser_cfg = {
|
parser_cfg = {
|
||||||
|
@ -44,14 +49,13 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ne
|
||||||
|
|
||||||
|
|
||||||
def train_config(config):
|
def train_config(config):
|
||||||
config_path = ensure_path(config)
|
config_path = util.ensure_path(config)
|
||||||
if not config_path.is_file():
|
if not config_path.is_file():
|
||||||
util.sys_exit(config_path.as_posix(), title="Config file not found")
|
prints(config_path, title="Config file not found", exits=True)
|
||||||
config = json.load(config_path)
|
config = json.load(config_path)
|
||||||
for setting in []:
|
for setting in []:
|
||||||
if setting not in config.keys():
|
if setting not in config.keys():
|
||||||
util.sys_exit("{s} not found in config file.".format(s=setting),
|
prints("%s not found in config file." % setting, title="Missing setting")
|
||||||
title="Missing setting")
|
|
||||||
|
|
||||||
|
|
||||||
def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_cfg,
|
def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_cfg,
|
||||||
|
@ -88,16 +92,8 @@ def evaluate(Language, gold_tuples, output_path):
|
||||||
return scorer
|
return scorer
|
||||||
|
|
||||||
|
|
||||||
def check_dirs(output_path, train_path, dev_path):
|
|
||||||
if not output_path.exists():
|
|
||||||
util.sys_exit(output_path.as_posix(), title="Output directory not found")
|
|
||||||
if not train_path.exists():
|
|
||||||
util.sys_exit(train_path.as_posix(), title="Training data not found")
|
|
||||||
if dev_path and not dev_path.exists():
|
|
||||||
util.sys_exit(dev_path.as_posix(), title="Development data not found")
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, nr_weight, nr_active_feat, **scores):
|
def print_progress(itn, nr_weight, nr_active_feat, **scores):
|
||||||
|
# TODO: Fix!
|
||||||
tpl = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
|
tpl = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
|
||||||
print(tpl.format(itn, nr_weight, nr_active_feat, **scores))
|
print(tpl.format(itn, nr_weight, nr_active_feat, **scores))
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,8 @@ from pathlib import Path
|
||||||
|
|
||||||
from . import about
|
from . import about
|
||||||
from . import util
|
from . import util
|
||||||
|
from .util import prints
|
||||||
|
from .compat import path2str
|
||||||
from .cli import download
|
from .cli import download
|
||||||
from .cli import link
|
from .cli import link
|
||||||
|
|
||||||
|
@ -114,9 +116,9 @@ def resolve_model_name(name):
|
||||||
"""
|
"""
|
||||||
if name == 'en' or name == 'de':
|
if name == 'en' or name == 'de':
|
||||||
versions = ['1.0.0', '1.1.0']
|
versions = ['1.0.0', '1.1.0']
|
||||||
data_path = Path(util.get_data_path())
|
data_path = util.get_data_path()
|
||||||
model_path = data_path / name
|
model_path = data_path / name
|
||||||
v_model_paths = [data_path / Path(name + '-' + v) for v in versions]
|
v_model_paths = [data_path / '%s-%s' % (name, v) for v in versions]
|
||||||
|
|
||||||
if not model_path.exists(): # no shortcut found
|
if not model_path.exists(): # no shortcut found
|
||||||
for v_path in v_model_paths:
|
for v_path in v_model_paths:
|
||||||
|
@ -126,10 +128,10 @@ def resolve_model_name(name):
|
||||||
return name
|
return name
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Found English model at {p}. This model is not "
|
"Found English model at %s. This model is not "
|
||||||
"compatible with the current version. See "
|
"compatible with the current version. See "
|
||||||
"https://spacy.io/docs/usage/models to download the "
|
"https://spacy.io/docs/usage/models to download the "
|
||||||
"new model.".format(p=v_path))
|
"new model." % path2str(v_path))
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
||||||
|
@ -142,11 +144,10 @@ class ModelDownload():
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(self, lang):
|
def load(self, lang):
|
||||||
util.print_msg(
|
prints("The spacy.%s.download command is now deprecated. Please use "
|
||||||
"The spacy.{l}.download command is now deprecated. Please use "
|
"python -m spacy download [model name or shortcut] instead. For "
|
||||||
"python -m spacy download [model name or shortcut] instead. For more "
|
"more info, see the docs: %s." % (lang, about.__docs__),
|
||||||
"info and available models, see the documentation: {d}. "
|
"Downloading default '%s' model now..." % lang,
|
||||||
"Downloading default '{l}' model now...".format(d=about.__docs__, l=lang),
|
|
||||||
title="Warning: deprecated command")
|
title="Warning: deprecated command")
|
||||||
download(lang)
|
download(lang)
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
from .compat import basestring_, unicode_, input_
|
from .compat import path2str, basestring_, input_
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
|
@ -151,95 +151,66 @@ def parse_package_meta(package_path, package, require=True):
|
||||||
def get_raw_input(description, default=False):
|
def get_raw_input(description, default=False):
|
||||||
"""
|
"""
|
||||||
Get user input via raw_input / input and return input value. Takes a
|
Get user input via raw_input / input and return input value. Takes a
|
||||||
description for the prompt, and an optional default value that's displayed
|
description, and an optional default value to display with the prompt.
|
||||||
with the prompt.
|
|
||||||
"""
|
"""
|
||||||
additional = ' (default: {d})'.format(d=default) if default else ''
|
additional = ' (default: %s)' % default if default else ''
|
||||||
prompt = ' {d}{a}: '.format(d=description, a=additional)
|
prompt = ' %s%s: ' % (description, additional)
|
||||||
user_input = input_(prompt)
|
user_input = input_(prompt)
|
||||||
return user_input
|
return user_input
|
||||||
|
|
||||||
|
|
||||||
def print_table(data, **kwargs):
|
def print_table(data, title=None):
|
||||||
"""
|
"""
|
||||||
Print data in table format. Can either take a list of tuples or a
|
Print data in table format. Can either take a list of tuples or a
|
||||||
dictionary, which will be converted to a list of tuples.
|
dictionary, which will be converted to a list of tuples.
|
||||||
"""
|
"""
|
||||||
if type(data) == dict:
|
if type(data) == dict:
|
||||||
data = list(data.items())
|
data = list(data.items())
|
||||||
|
tpl_row = ' {:<15}' * len(data[0])
|
||||||
tpl_msg = '\n{msg}\n'
|
|
||||||
tpl_title = '\n \033[93m{msg}\033[0m'
|
|
||||||
tpl_row =" {:<15}" * len(data[0])
|
|
||||||
table = '\n'.join([tpl_row.format(l, v) for l, v in data])
|
table = '\n'.join([tpl_row.format(l, v) for l, v in data])
|
||||||
|
if title:
|
||||||
if 'title' in kwargs and kwargs['title']:
|
print('\n \033[93m{}\033[0m'.format(title))
|
||||||
print(tpl_title.format(msg=kwargs['title']))
|
print('\n{}\n'.format(table))
|
||||||
|
|
||||||
print(tpl_msg.format(msg=table))
|
|
||||||
|
|
||||||
|
|
||||||
def print_markdown(data, **kwargs):
|
def print_markdown(data, title=None):
|
||||||
"""
|
"""
|
||||||
Print listed data in GitHub-flavoured Markdown format so it can be
|
Print listed data in GitHub-flavoured Markdown format so it can be
|
||||||
copy-pasted into issues. Can either take a list of tuples or a dictionary,
|
copy-pasted into issues. Can either take a list of tuples or a dictionary.
|
||||||
which will be converted to a list of tuples.
|
|
||||||
"""
|
"""
|
||||||
def excl_value(value):
|
def excl_value(value):
|
||||||
# don't print value if it contains absolute path of directory (i.e.
|
return Path(value).exists() # contains path (personal info)
|
||||||
# personal info). Other conditions can be included here if necessary.
|
|
||||||
if unicode_(Path(__file__).parent) in value:
|
|
||||||
return True
|
|
||||||
|
|
||||||
if type(data) == dict:
|
if type(data) == dict:
|
||||||
data = list(data.items())
|
data = list(data.items())
|
||||||
|
markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
|
||||||
tpl_msg = "\n{msg}\n"
|
if title:
|
||||||
tpl_title = "\n## {msg}"
|
print("\n## {}".format(title))
|
||||||
tpl_row = "* **{l}:** {v}"
|
print('\n{}\n'.format('\n'.join(markdown)))
|
||||||
markdown = '\n'.join([tpl_row.format(l=l, v=v) for l, v in data if not excl_value(v)])
|
|
||||||
|
|
||||||
if 'title' in kwargs and kwargs['title']:
|
|
||||||
print(tpl_title.format(msg=kwargs['title']))
|
|
||||||
print(tpl_msg.format(msg=markdown))
|
|
||||||
|
|
||||||
|
|
||||||
def print_msg(*text, **kwargs):
|
def prints(*texts, title=None, exits=False):
|
||||||
"""
|
"""
|
||||||
Print formatted message. Each positional argument is rendered as newline-
|
Print formatted message. Each positional argument is rendered as newline-
|
||||||
separated paragraph. If kwarg 'title' exist, title is printed above the text
|
separated paragraph. An optional highlighted title is printed above the text
|
||||||
and highlighted (using ANSI escape sequences manually to avoid unnecessary
|
(using ANSI escape sequences manually to avoid unnecessary dependency).
|
||||||
dependency).
|
|
||||||
"""
|
"""
|
||||||
message = '\n\n'.join([_wrap_text(t) for t in text])
|
title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
|
||||||
tpl_msg = '\n{msg}\n'
|
message = '\n\n'.join([_wrap(text) for text in texts])
|
||||||
tpl_title = '\n\033[93m{msg}\033[0m'
|
print('\n{}{}\n'.format(title, message))
|
||||||
|
if exits:
|
||||||
if 'title' in kwargs and kwargs['title']:
|
sys.exit(0)
|
||||||
title = _wrap_text(kwargs['title'])
|
|
||||||
print(tpl_title.format(msg=title))
|
|
||||||
print(tpl_msg.format(msg=message))
|
|
||||||
|
|
||||||
|
|
||||||
def _wrap_text(text):
|
def _wrap(text, wrap_max=80, indent=4):
|
||||||
"""
|
"""
|
||||||
Wrap text at given width using textwrap module. Indent should consist of
|
Wrap text at given width using textwrap module. Indent should consist of
|
||||||
spaces. Its length is deducted from wrap width to ensure exact wrapping.
|
spaces. Its length is deducted from wrap width to ensure exact wrapping.
|
||||||
"""
|
"""
|
||||||
wrap_max = 80
|
indent = indent * ' '
|
||||||
indent = ' '
|
|
||||||
wrap_width = wrap_max - len(indent)
|
wrap_width = wrap_max - len(indent)
|
||||||
|
if isinstance(text, Path):
|
||||||
|
text = path2str(text)
|
||||||
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
|
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
|
||||||
subsequent_indent=indent, break_long_words=False,
|
subsequent_indent=indent, break_long_words=False,
|
||||||
break_on_hyphens=False)
|
break_on_hyphens=False)
|
||||||
|
|
||||||
|
|
||||||
def sys_exit(*messages, **kwargs):
|
|
||||||
"""
|
|
||||||
Performs SystemExit. For modules used from the command line, like
|
|
||||||
download and link. To print message, use the same arguments as for
|
|
||||||
print_msg().
|
|
||||||
"""
|
|
||||||
if messages:
|
|
||||||
print_msg(*messages, **kwargs)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user