Merge remote-tracking branch 'origin/develop-downloads'

This commit is contained in:
Matthew Honnibal 2017-03-16 12:00:42 -05:00
commit 8843b84bd1
15 changed files with 346 additions and 199 deletions

View File

@ -8,5 +8,5 @@ murmurhash>=0.26,<0.27
plac<0.9.3
six
ujson>=1.35
sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3
requests>=2.13.0,<3.0.0

View File

@ -240,9 +240,9 @@ def setup_package():
'plac<0.9.3',
'six',
'pathlib',
'sputnik>=0.9.2,<0.10.0',
'ujson>=1.35',
'dill>=0.2,<0.3'],
'dill>=0.2,<0.3',
'requests>=2.13.0,<3.0.0'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',

View File

@ -1,7 +1,9 @@
import pathlib
# coding: utf8
from __future__ import unicode_literals, print_function
from .util import set_lang_class, get_lang_class
from .about import __version__
import json
from pathlib import Path
from .util import set_lang_class, get_lang_class, parse_package_meta
from . import en
from . import de
@ -16,11 +18,6 @@ from . import sv
from . import fi
from . import bn
try:
basestring
except NameError:
basestring = str
set_lang_class(en.English.lang, en.English)
set_lang_class(de.German.lang, de.German)
@ -36,11 +33,16 @@ set_lang_class(fi.Finnish.lang, fi.Finnish)
set_lang_class(bn.Bengali.lang, bn.Bengali)
# Load a model by name and return an instance of the matching language class,
# constructed with the (modified) keyword overrides.
# NOTE(review): this span shows two variants of the body back to back without
# indentation. The first group resolves the model through
# util.split_data_name / util.match_best_version; the second reads the
# package's meta.json through parse_package_meta. Where both assign the same
# name (`cls`, overrides['path']) the later assignment wins. Confirm which
# variant is meant to be live.
def load(name, **overrides):
target_name, target_version = util.split_data_name(name)
# A caller-supplied 'path' override takes precedence over the default data path.
data_path = overrides.get('path', util.get_data_path())
path = util.match_best_version(target_name, target_version, data_path)
cls = get_lang_class(target_name)
overrides['path'] = path
# parse_package_meta returns False when no meta.json is found, hence the
# truthiness check on `meta` below; the language then falls back to 'en'.
meta = parse_package_meta(data_path, name)
lang = meta['lang'] if meta and 'lang' in meta else 'en'
cls = get_lang_class(lang)
overrides['meta'] = meta
# NOTE(review): `data_path / name` needs data_path to support the `/`
# operator; a plain string passed as overrides['path'] would raise TypeError
# here -- confirm callers always pass a pathlib.Path.
overrides['path'] = Path(data_path / name)
return cls(**overrides)
def info(name):
    """Print the meta data of the model package `name` as indented JSON.

    The package is looked up under util.get_data_path(). When no meta.json is
    found, parse_package_meta() returns False and "false" is printed.
    """
    data_path = util.get_data_path()
    meta = parse_package_meta(data_path, name)
    pretty = json.dumps(meta, indent=2)
    print(pretty)

View File

@ -1,5 +1,4 @@
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
@ -10,7 +9,8 @@ __uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@explosion.ai'
__license__ = 'MIT'
# Requirement strings keyed by language code; looked up via
# about.__models__.get(lang, lang) in the sputnik-based download code.
__models__ = {
'en': 'en>=1.1.0,<1.2.0',
'de': 'de>=1.0.0,<1.1.0',
}
# Documentation URL quoted in deprecation and error messages.
__docs__ = 'https://spacy.io/docs/usage'
# Base URL that download_model() combines with a model archive filename.
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
# URL of the JSON compatibility table fetched by get_compatibility().
# NOTE(review): this URL embeds a `token` query parameter. It looks like an
# access token for a non-public file -- confirm it is safe to publish and that
# it does not expire; otherwise every download will hit the "Server error" path.
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json?token=ANAt54fi5zcUtnwGhMLw2klWwcAyHkZGks5Y0nw1wA%3D%3D'
# Short aliases that download() expands to full model package names.
__shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_web_md', 'vectors': 'en_vectors_glove_md'}

View File

@ -1,14 +1,5 @@
import plac
from ..download import download
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
download('de', force=force, data_path=data_path)
from ..deprecated import ModelDownload as download
if __name__ == '__main__':
plac.call(main)
download.de()

View File

@ -1,35 +1,13 @@
from sputnik.dir_package import DirPackage
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
import sputnik
from pathlib import Path
from . import about
from . import util
from .download import download
def get_package(data_dir):
if not isinstance(data_dir, six.string_types):
raise RuntimeError('data_dir must be a string')
return DirPackage(data_dir)
def get_package_by_name(name=None, via=None):
if name is None:
return
lang = get_lang_class(name)
try:
return sputnik.package(about.__title__, about.__version__,
name, data_path=via)
except PackageNotFoundException as e:
raise RuntimeError("Model '%s' not installed. Please run 'python -m "
"%s.download' to install latest compatible "
"model." % (name, lang.__module__))
except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy "
"version. Please run 'python -m %s.download "
"--force' to install latest compatible model." %
(lang.__module__))
try:
basestring
except NameError:
basestring = str
def read_lang_data(package):
@ -43,7 +21,6 @@ def read_lang_data(package):
return tokenization, prefix, suffix, infix
def align_tokens(ref, indices): # Deprecated, surely?
start = 0
queue = list(indices)
@ -79,4 +56,55 @@ def detokenize(token_rules, words): # Deprecated?
return positions
def fix_glove_vectors_loading(overrides):
    """Special-case hack for loading the GloVe vectors, to support deprecated
    <1.0 stuff. Phase this out once the data is fixed.

    Takes the keyword overrides passed to a language class and, unless the
    caller already supplied 'add_vectors' or disabled loading with
    path=False, tries to locate a vectors package next to the model and
    installs an 'add_vectors' callback that loads its vocab/vec.bin.

    Returns the same `overrides` dict (mutated in place when vectors are
    found). Raises ValueError when the old 'data_dir' keyword is used
    without 'path'.
    """
    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")

    requested = overrides.get('path')
    if requested is False:
        # Loading from disk was explicitly disabled; nothing to patch.
        return overrides

    if requested in (None, True):
        data_path = util.get_data_path()
    else:
        model_path = overrides['path']
        if isinstance(model_path, basestring):
            model_path = Path(model_path)
        # Vector packages are looked up in the directory containing the model.
        data_path = model_path.parent

    if 'add_vectors' in overrides:
        # The caller brought their own vector loader; leave it alone.
        return overrides

    if 'vectors' in overrides:
        vectors_name = overrides['vectors']
    else:
        vectors_name = 'en_glove_cc_300_1m_vectors'
    vec_path = util.match_best_version(vectors_name, None, data_path)
    if vec_path is None:
        return overrides

    vec_path = vec_path / 'vocab' / 'vec.bin'
    overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
    return overrides
class ModelDownload():
    """Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
    importing ModelDownload as download and calling download.en() etc."""

    @classmethod
    def load(cls, lang):
        """Print the deprecation warning for spacy.<lang>.download, then
        download the default model for `lang` via download()."""
        warning = (
            "The spacy.{l}.download command is now deprecated. Please use "
            "spacy.download [model name or shortcut] instead. For more "
            "info and available models, see the documentation: {d}. "
            "Downloading default '{l}' model now...")
        util.print_msg(warning.format(d=about.__docs__, l=lang),
                       title="Warning: deprecated command")
        download(lang)

    @classmethod
    def en(cls, *args, **kwargs):
        """Deprecated entry point for the English model; arguments are ignored."""
        cls.load('en')

    @classmethod
    def de(cls, *args, **kwargs):
        """Deprecated entry point for the German model; arguments are ignored."""
        cls.load('de')

View File

@ -1,47 +1,80 @@
from __future__ import print_function
import sys
import shutil
import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
# coding: utf8
from __future__ import unicode_literals
import pip
import plac
import requests
from os import path
from . import about
from . import util
# Download a model.
# NOTE(review): this span shows two definitions of download() and two bodies
# interleaved without indentation: a sputnik-based one taking
# (lang, force, fail_on_exist, data_path) and a pip-based one taking
# (model, direct). The statements that use `lang`, `force`, `fail_on_exist`,
# `data_path`, `sputnik`, `shutil` and `sys` belong to the former; confirm
# which variant is meant to be live before editing.
def download(lang, force=False, fail_on_exist=True, data_path=None):
if not data_path:
data_path = util.get_data_path(require_exists=False)
# plac maps `model` to a positional command-line argument and `direct` to the
# -d flag.
@plac.annotations(
model=("Model to download", "positional", None, str),
direct=("Force direct download", "flag", "d", bool)
)
def download(model=None, direct=False):
"""Download compatible model from default download path using pip."""
# spaCy uses pathlib, and util.get_data_path returns a pathlib.Path object,
# but sputnik (which we're using below) doesn't use pathlib and requires
# its data_path parameters to be strings, so we coerce the data_path to a
# str here.
data_path = str(data_path)
# Exits early when no model name was given or the removed 'all' target is used.
check_error_depr(model)
try:
pkg = sputnik.package(about.__title__, about.__version__,
about.__models__.get(lang, lang), data_path)
if force:
shutil.rmtree(pkg.path)
elif fail_on_exist:
print("Model already installed. Please run 'python -m "
"spacy.%s.download --force' to reinstall." % lang, file=sys.stderr)
sys.exit(0)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
# With --direct the given name is used verbatim as '<name>/<name>.tar.gz';
# otherwise shortcuts are expanded and the version is taken from the
# compatibility table, giving '<name>-<version>/<name>-<version>.tar.gz'.
if direct:
download_model('{m}/{m}.tar.gz'.format(m=model))
else:
model = about.__shortcuts__[model] if model in about.__shortcuts__ else model
compatibility = get_compatibility()
version = get_version(model, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model, v=version))
package = sputnik.install(about.__title__, about.__version__,
about.__models__.get(lang, lang), data_path)
try:
sputnik.package(about.__title__, about.__version__,
about.__models__.get(lang, lang), data_path)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
print("Model failed to install. Please run 'python -m "
"spacy.%s.download --force'." % lang, file=sys.stderr)
sys.exit(1)
# Fetch the model compatibility table for the running spaCy version.
# Requests about.__compatibility__; on HTTP 200 it returns the entry of the
# response's 'spacy' mapping keyed by about.__version__. On any other status,
# or when the version has no entry, it calls util.sys_exit() with a message.
def get_compatibility():
version = about.__version__
r = requests.get(about.__compatibility__)
if r.status_code != 200:
util.sys_exit(
"Couldn't fetch compatibility table. Please find the right model for "
"your spaCy installation (v{v}), and download it manually:".format(v=version),
"python -m spacy.download [full model name + version] --direct",
title="Server error ({c})".format(c=r.status_code))
# NOTE(review): `data_path` is never assigned in this function, so the next
# line would raise NameError if executed. It looks like residue of the
# sputnik-based download() -- confirm and remove.
print("Model successfully installed to %s" % data_path, file=sys.stderr)
comp = r.json()['spacy']
if version not in comp:
util.sys_exit(
"No compatible models found for v{v} of spaCy.".format(v=version),
title="Compatibility error")
else:
return comp[version]
def get_version(model, comp):
    """Return the first version listed for `model` in the compatibility
    mapping `comp` (model name -> sequence of versions).

    Exits through util.sys_exit() with a "Compatibility error" message when
    `model` has no entry in `comp`.
    """
    if model not in comp:
        reason = ("No compatible model found for "
                  "{m} (spaCy v{v}).").format(m=model, v=about.__version__)
        util.sys_exit(reason, title="Compatibility error")
    versions = comp[model]
    return versions[0]
def download_model(filename):
    """Install the model archive `filename` from the release server via pip.

    `filename` is the archive path relative to about.__download_url__,
    e.g. '<name>-<version>/<name>-<version>.tar.gz'.
    """
    util.print_msg("Downloading {f}".format(f=filename))
    # Build the URL with '/' explicitly: os.path.join uses the platform's path
    # separator, which produces backslashes (an invalid URL) on Windows.
    download_url = about.__download_url__.rstrip('/') + '/' + filename
    pip.main(['install', download_url])
def check_error_depr(model):
    """Validate the `model` argument of the download command.

    Exits through util.sys_exit() with a usage hint when `model` is falsy,
    and with a deprecation notice when it is the removed 'all' target.
    Returns None for any other value.
    """
    if not model:
        usage = "python -m spacy.download [name or shortcut]"
        util.sys_exit(usage, title="Missing model name or shortcut")
    if model == 'all':
        notice = (
            "As of v1.7.0, the download all command is deprecated. Please "
            "download the models individually via spacy.download [model name] "
            "or pip install. For more info on this, see the documentation: "
            "{d}").format(d=about.__docs__)
        util.sys_exit(notice, title="Deprecated command")
# Command-line entry point: plac builds the argument parser from the
# annotations on download() and calls it with the parsed arguments.
if __name__ == '__main__':
    plac.call(download)

View File

@ -1,19 +1,16 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
from os import path
from pathlib import Path
from ..util import match_best_version
from ..util import get_data_path
from ..language import Language
from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
from ..deprecated import fix_glove_vectors_loading
from .language_data import *
try:
basestring
except NameError:
@ -38,34 +35,6 @@ class English(Language):
def __init__(self, **overrides):
# Make a special-case hack for loading the GloVe vectors, to support
# deprecated <1.0 stuff. Phase this out once the data is fixed.
overrides = _fix_deprecated_glove_vectors_loading(overrides)
# Special-case hack for loading the GloVe vectors, to support <1.0
overrides = fix_glove_vectors_loading(overrides)
Language.__init__(self, **overrides)
def _fix_deprecated_glove_vectors_loading(overrides):
if 'data_dir' in overrides and 'path' not in overrides:
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
if overrides.get('path') is False:
return overrides
if overrides.get('path') in (None, True):
data_path = get_data_path()
else:
path = overrides['path']
if isinstance(path, basestring):
path = Path(path)
data_path = path.parent
vec_path = None
if 'add_vectors' not in overrides:
if 'vectors' in overrides:
vec_path = match_best_version(overrides['vectors'], None, data_path)
if vec_path is None:
return overrides
else:
vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
if vec_path is not None:
vec_path = vec_path / 'vocab' / 'vec.bin'
if vec_path is not None:
overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
return overrides

View File

@ -1,25 +1,5 @@
import plac
import sputnik
from ..download import download
from .. import about
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
if force:
sputnik.purge(about.__title__, about.__version__)
if data_size in ('all', 'parser'):
print("Downloading parsing model")
download('en', force=False, data_path=data_path)
if data_size in ('all', 'glove'):
print("Downloading GloVe vectors")
download('en_glove_cc_300_1m_vectors', force=False, data_path=data_path)
from ..deprecated import ModelDownload as download
if __name__ == '__main__':
plac.call(main)
download.en()

View File

@ -281,6 +281,7 @@ class Language(object):
if path is True:
path = util.match_best_version(self.lang, '', util.get_data_path())
self.meta = overrides.get('meta', {})
self.path = path
self.vocab = self.Defaults.create_vocab(self) \

72
spacy/link.py Normal file
View File

@ -0,0 +1,72 @@
# coding: utf8
from __future__ import unicode_literals
import io
import os
import pip
import site
import plac
from . import util
@plac.annotations(
    origin=("Package name or path to model", "positional", None, str),
    link_name=("Name of link", "positional", None, str),
    force=("Force overwriting existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
    """Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name).

    For an installed package the link target is
    <site-packages>/<origin>/<origin>-<version>, with the version read from
    the package's meta.json; otherwise `origin` itself is the target.
    """
    model_path = origin
    if is_package(origin):
        # NOTE(review): site.getsitepackages() is assumed to exist and to list
        # the directory the package was installed into first -- confirm for
        # virtualenv and user installs.
        package_path = site.getsitepackages()[0]
        meta = get_meta(package_path, origin)
        data_dir = origin + '-' + meta['version']
        model_path = os.path.join(package_path, origin, data_dir)
    symlink(model_path, link_name, force)
def symlink(model_path, link_name, force):
    """Create the symlink `link_name` in the data directory, pointing at
    `model_path`.

    model_path: directory containing the model data; exits through
        util.sys_exit() when it is not an existing directory.
    link_name: name of the link to create inside util.get_data_path().
    force: when true, an existing entry with that name is removed first;
        otherwise the command exits and asks for --force.
    """
    if not os.path.isdir(model_path):
        util.sys_exit(
            "The data should be located in {p}".format(p=model_path),
            title="Can't locate model data")

    data_path = str(util.get_data_path())
    # os.path.join drops the leading component when data_path is absolute, so
    # the prefix only matters for a relative data path.
    link_path = os.path.join(os.path.abspath(__file__ + '/../../'), data_path, link_name)

    # lexists() also reports symlinks whose target has gone away. isdir()
    # follows the link and returns False for those, which made os.symlink()
    # below fail with "file exists" even when --force was given.
    if os.path.lexists(link_path):
        if force:
            os.unlink(link_path)
        else:
            util.sys_exit(
                "To overwrite an existing link, use the --force flag.",
                title="Link {l} already exists".format(l=link_name))

    os.symlink(model_path, link_path)
    util.print_msg(
        "{a} --> {b}".format(a=model_path, b=link_path),
        "You can now load the model via spacy.load('{l}').".format(l=link_name),
        title="Linking successful")
def get_meta(package_path, package):
    """Return the parsed meta.json of `package` under `package_path`.

    Exits through util.sys_exit() (without an extra message) when
    util.parse_package_meta() reports that no meta data could be read.
    """
    meta = util.parse_package_meta(package_path, package)
    if meta:
        return meta
    util.sys_exit()
    return meta
def is_package(origin):
    """Return True if `origin` names an installed pip distribution.

    Distribution names are compared with '-' replaced by '_', so `origin` is
    expected in its importable (underscore) form.
    """
    return any(
        dist.project_name.replace('-', '_') == origin
        for dist in pip.get_installed_distributions())
# Command-line entry point: plac builds the argument parser from the
# annotations on link() and calls it with the parsed arguments.
if __name__ == '__main__':
    plac.call(link)

View File

@ -0,0 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals
from ..download import download, get_compatibility, get_version, check_error_depr
import pytest
def test_download_fetch_compatibility():
    """The compatibility table for the running version is a mapping.

    Calls get_compatibility(), which performs a real HTTP request.
    """
    compatibility = get_compatibility()
    # isinstance() rather than an exact type comparison, so dict subclasses
    # (e.g. an ordered mapping returned by the JSON decoder) are accepted too.
    assert isinstance(compatibility, dict)
@pytest.mark.slow
@pytest.mark.parametrize('model', ['en_core_web_md-1.2.0'])
def test_download_direct_download(model):
    """A full '<name>-<version>' string can be downloaded with direct=True.

    Marked slow: this runs the actual download.
    """
    download(model, direct=True)
@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_succeeds(model):
    """get_version() returns a truthy version for a model present in the table."""
    versions = ['1.7.0', '0.100.0']
    comp = {model: versions}
    assert get_version(model, comp)
@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_fails(model):
    """get_version() exits when the requested model is missing from the table."""
    other_model = 'test_' + model
    comp = {other_model: ['1.7.0', '0.100.0']}
    with pytest.raises(SystemExit):
        assert get_version(model, comp)
@pytest.mark.parametrize('model', [False, None, '', 'all'])
def test_download_no_model_depr_error(model):
    """A missing model name and the removed 'all' target both cause an exit."""
    with pytest.raises(SystemExit):
        check_error_depr(model)

View File

@ -1,40 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals
import io
import pytest
import dill as pickle
from ..strings import StringStore
from ..vocab import Vocab
from ..attrs import NORM
def test_pickle_string_store():
sstore = StringStore()
hello = sstore['hello']
bye = sstore['bye']
bdata = pickle.dumps(sstore, protocol=-1)
unpickled = pickle.loads(bdata)
assert unpickled['hello'] == hello
assert unpickled['bye'] == bye
assert len(sstore) == len(unpickled)
@pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
def test_pickle_string_store(stringstore, text1, text2):
store1 = stringstore[text1]
store2 = stringstore[text2]
data = pickle.dumps(stringstore, protocol=-1)
unpickled = pickle.loads(data)
assert unpickled[text1] == store1
assert unpickled[text2] == store2
assert len(stringstore) == len(unpickled)
@pytest.mark.xfail
def test_pickle_vocab():
@pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
def test_pickle_vocab(text1, text2):
vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
dog = vocab[u'dog']
cat = vocab[u'cat']
assert dog.norm_ == 'do'
assert cat.norm_ == 'ca'
bdata = pickle.dumps(vocab)
unpickled = pickle.loads(bdata)
assert unpickled[u'dog'].orth == dog.orth
assert unpickled[u'cat'].orth == cat.orth
assert unpickled[u'dog'].norm == dog.norm
assert unpickled[u'cat'].norm == cat.norm
dog_ = unpickled[u'dog']
cat_ = unpickled[u'cat']
assert dog_.norm != cat_.norm
lex1 = vocab[text1]
lex2 = vocab[text2]
assert lex1.norm_ == text1[:-1]
assert lex2.norm_ == text2[:-1]
data = pickle.dumps(vocab)
unpickled = pickle.loads(data)
assert unpickled[text1].orth == lex1.orth
assert unpickled[text2].orth == lex2.orth
assert unpickled[text1].norm == lex1.norm
assert unpickled[text2].norm == lex2.norm
assert unpickled[text1].norm != unpickled[text2].norm

View File

@ -1,13 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
import os
import io
import json
import re
import os.path
import pathlib
import sys
import six
import textwrap
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
try:
@ -144,3 +147,53 @@ def check_renamed_kwargs(renamed, kwargs):
for old, new in renamed.items():
if old in kwargs:
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def parse_package_meta(package_path, package):
    """Read and return the parsed <package_path>/<package>/meta.json.

    package_path may be a string or a path object; it is converted with
    str(). Returns the decoded JSON content, or False (after printing a
    "No meta.json found" message) when the file does not exist.
    """
    location = os.path.join(str(package_path), package, 'meta.json')
    if os.path.isfile(location):
        with io.open(location, encoding='utf8') as file_:
            return json.load(file_)
    print_msg("'{p}' doesn't seem to be a valid model package.".format(p=package),
              title="No meta.json found")
    return False
def print_msg(*text, **kwargs):
    """Print formatted message. Each positional argument is rendered as newline-
    separated paragraph. If kwarg 'title' exist, title is printed above the text
    and highlighted (using ANSI escape sequences manually to avoid unnecessary
    dependency)."""
    # Wrap the body first so a failure here happens before anything is printed.
    paragraphs = [_wrap_text(paragraph) for paragraph in text]
    body = '\n\n'.join(paragraphs)

    heading = kwargs.get('title')
    if heading:
        # \033[93m ... \033[0m switches the terminal colour on and back off.
        print('\n\033[93m{msg}\033[0m'.format(msg=_wrap_text(heading)))
    print('\n{msg}\n'.format(msg=body))
def _wrap_text(text):
"""Wrap text at given width using textwrap module. Indent should consist of
spaces. Its length is deducted from wrap width to ensure exact wrapping."""
wrap_max = 80
indent = ' '
wrap_width = wrap_max - len(indent)
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
subsequent_indent=indent, break_long_words=False,
break_on_hyphens=False)
def sys_exit(*messages, **kwargs):
    """Performs SystemExit. For modules used from the command line, like
    download and link. To print message, use the same arguments as for
    print_msg().

    Without positional messages nothing is printed. The exit status is
    always 0, including when the message reports an error.
    """
    has_output = bool(messages)
    if has_output:
        print_msg(*messages, **kwargs)
    sys.exit(0)

View File

@ -57,20 +57,6 @@ p Many of the associated tools and resources that we're developing alongside spa
+cell
| Super sparse multi-class machine learning with Cython.
+row
+cell
+src(gh("sputnik")) Sputnik
+cell
| Data package manager library for spaCy.
+row
+cell
+src(gh("sputnik-server")) Sputnik Server
+cell
| Index service for the Sputnik data package manager for spaCy.
+row
+cell
+src(gh("cymem")) Cymem