Merge remote-tracking branch 'origin/develop-downloads'

This commit is contained in:
Matthew Honnibal 2017-03-16 12:00:42 -05:00
commit 8843b84bd1
15 changed files with 346 additions and 199 deletions

View File

@ -8,5 +8,5 @@ murmurhash>=0.26,<0.27
plac<0.9.3 plac<0.9.3
six six
ujson>=1.35 ujson>=1.35
sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3 dill>=0.2,<0.3
requests>=2.13.0,<3.0.0

View File

@ -240,9 +240,9 @@ def setup_package():
'plac<0.9.3', 'plac<0.9.3',
'six', 'six',
'pathlib', 'pathlib',
'sputnik>=0.9.2,<0.10.0',
'ujson>=1.35', 'ujson>=1.35',
'dill>=0.2,<0.3'], 'dill>=0.2,<0.3',
'requests>=2.13.0,<3.0.0'],
classifiers=[ classifiers=[
'Development Status :: 5 - Production/Stable', 'Development Status :: 5 - Production/Stable',
'Environment :: Console', 'Environment :: Console',

View File

@ -1,7 +1,9 @@
import pathlib # coding: utf8
from __future__ import unicode_literals, print_function
from .util import set_lang_class, get_lang_class import json
from .about import __version__ from pathlib import Path
from .util import set_lang_class, get_lang_class, parse_package_meta
from . import en from . import en
from . import de from . import de
@ -16,11 +18,6 @@ from . import sv
from . import fi from . import fi
from . import bn from . import bn
try:
basestring
except NameError:
basestring = str
set_lang_class(en.English.lang, en.English) set_lang_class(en.English.lang, en.English)
set_lang_class(de.German.lang, de.German) set_lang_class(de.German.lang, de.German)
@ -36,11 +33,16 @@ set_lang_class(fi.Finnish.lang, fi.Finnish)
set_lang_class(bn.Bengali.lang, bn.Bengali) set_lang_class(bn.Bengali.lang, bn.Bengali)
def load(name, **overrides): def load(name, **overrides):
target_name, target_version = util.split_data_name(name)
data_path = overrides.get('path', util.get_data_path()) data_path = overrides.get('path', util.get_data_path())
path = util.match_best_version(target_name, target_version, data_path) meta = parse_package_meta(data_path, name)
cls = get_lang_class(target_name) lang = meta['lang'] if meta and 'lang' in meta else 'en'
overrides['path'] = path cls = get_lang_class(lang)
overrides['meta'] = meta
overrides['path'] = Path(data_path / name)
return cls(**overrides) return cls(**overrides)
def info(name):
    """Print the meta data of the model package `name` as indented JSON.

    The package is looked up inside the directory returned by
    util.get_data_path(). Whatever parse_package_meta() returns is dumped
    as-is.
    """
    data_path = util.get_data_path()
    package_meta = parse_package_meta(data_path, name)
    formatted = json.dumps(package_meta, indent=2)
    print(formatted)

View File

@ -1,5 +1,4 @@
# inspired from: # inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
@ -10,7 +9,8 @@ __uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal' __author__ = 'Matthew Honnibal'
__email__ = 'matt@explosion.ai' __email__ = 'matt@explosion.ai'
__license__ = 'MIT' __license__ = 'MIT'
__models__ = {
'en': 'en>=1.1.0,<1.2.0', __docs__ = 'https://spacy.io/docs/usage'
'de': 'de>=1.0.0,<1.1.0', __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
} __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json?token=ANAt54fi5zcUtnwGhMLw2klWwcAyHkZGks5Y0nw1wA%3D%3D'
__shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_web_md', 'vectors': 'en_vectors_glove_md'}

View File

@ -1,14 +1,5 @@
import plac from ..deprecated import ModelDownload as download
from ..download import download
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
download('de', force=force, data_path=data_path)
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) download.de()

View File

@ -1,35 +1,13 @@
from sputnik.dir_package import DirPackage from pathlib import Path
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
import sputnik
from . import about from . import about
from . import util
from .download import download
def get_package(data_dir): try:
if not isinstance(data_dir, six.string_types): basestring
raise RuntimeError('data_dir must be a string') except NameError:
return DirPackage(data_dir) basestring = str
def get_package_by_name(name=None, via=None):
if name is None:
return
lang = get_lang_class(name)
try:
return sputnik.package(about.__title__, about.__version__,
name, data_path=via)
except PackageNotFoundException as e:
raise RuntimeError("Model '%s' not installed. Please run 'python -m "
"%s.download' to install latest compatible "
"model." % (name, lang.__module__))
except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy "
"version. Please run 'python -m %s.download "
"--force' to install latest compatible model." %
(lang.__module__))
def read_lang_data(package): def read_lang_data(package):
@ -43,7 +21,6 @@ def read_lang_data(package):
return tokenization, prefix, suffix, infix return tokenization, prefix, suffix, infix
def align_tokens(ref, indices): # Deprecated, surely? def align_tokens(ref, indices): # Deprecated, surely?
start = 0 start = 0
queue = list(indices) queue = list(indices)
@ -79,4 +56,55 @@ def detokenize(token_rules, words): # Deprecated?
return positions return positions
def fix_glove_vectors_loading(overrides):
    """Special-case hack for loading the GloVe vectors, to support deprecated
    <1.0 stuff. Phase this out once the data is fixed.

    overrides (dict): keyword arguments handed to a Language class.
    RETURNS (dict): the same dict, possibly with an 'add_vectors' callable set.
    RAISES (ValueError): if the old 'data_dir' argument is used without 'path'.
    """
    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    model_path = overrides.get('path')
    if model_path is False:
        # Loading from disk was explicitly disabled; leave overrides alone.
        return overrides
    if model_path in (None, True):
        data_path = util.get_data_path()
    else:
        if isinstance(model_path, basestring):
            model_path = Path(model_path)
        # Vector packages are searched next to the model directory.
        data_path = model_path.parent
    if 'add_vectors' in overrides:
        # The caller supplied a vector loader; never replace it.
        return overrides
    if 'vectors' in overrides:
        vec_dir = util.match_best_version(overrides['vectors'], None, data_path)
    else:
        vec_dir = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
    if vec_dir is None:
        return overrides
    vec_path = vec_dir / 'vocab' / 'vec.bin'
    overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
    return overrides
class ModelDownload():
    """Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
    importing ModelDownload as download and calling download.en() etc."""

    @classmethod
    def load(cls, lang):
        """Print the deprecation warning, then download the model for `lang`."""
        warning = (
            "The spacy.{l}.download command is now deprecated. Please use "
            "spacy.download [model name or shortcut] instead. For more "
            "info and available models, see the documentation: {d}. "
            "Downloading default '{l}' model now...".format(d=about.__docs__, l=lang))
        util.print_msg(warning, title="Warning: deprecated command")
        download(lang)

    @classmethod
    def en(cls, *args, **kwargs):
        """Deprecated entry point for English; extra arguments are ignored."""
        cls.load('en')

    @classmethod
    def de(cls, *args, **kwargs):
        """Deprecated entry point for German; extra arguments are ignored."""
        cls.load('de')

View File

@ -1,47 +1,80 @@
from __future__ import print_function # coding: utf8
from __future__ import unicode_literals
import sys
import shutil
import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
import pip
import plac
import requests
from os import path
from . import about from . import about
from . import util from . import util
def download(lang, force=False, fail_on_exist=True, data_path=None): @plac.annotations(
if not data_path: model=("Model to download", "positional", None, str),
data_path = util.get_data_path(require_exists=False) direct=("Force direct download", "flag", "d", bool)
)
def download(model=None, direct=False):
"""Download compatible model from default download path using pip."""
# spaCy uses pathlib, and util.get_data_path returns a pathlib.Path object, check_error_depr(model)
# but sputnik (which we're using below) doesn't use pathlib and requires
# its data_path parameters to be strings, so we coerce the data_path to a
# str here.
data_path = str(data_path)
try: if direct:
pkg = sputnik.package(about.__title__, about.__version__, download_model('{m}/{m}.tar.gz'.format(m=model))
about.__models__.get(lang, lang), data_path) else:
if force: model = about.__shortcuts__[model] if model in about.__shortcuts__ else model
shutil.rmtree(pkg.path) compatibility = get_compatibility()
elif fail_on_exist: version = get_version(model, compatibility)
print("Model already installed. Please run 'python -m " download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model, v=version))
"spacy.%s.download --force' to reinstall." % lang, file=sys.stderr)
sys.exit(0)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
package = sputnik.install(about.__title__, about.__version__,
about.__models__.get(lang, lang), data_path)
try: def get_compatibility():
sputnik.package(about.__title__, about.__version__, version = about.__version__
about.__models__.get(lang, lang), data_path) r = requests.get(about.__compatibility__)
except (PackageNotFoundException, CompatiblePackageNotFoundException): if r.status_code != 200:
print("Model failed to install. Please run 'python -m " util.sys_exit(
"spacy.%s.download --force'." % lang, file=sys.stderr) "Couldn't fetch compatibility table. Please find the right model for "
sys.exit(1) "your spaCy installation (v{v}), and download it manually:".format(v=version),
"python -m spacy.download [full model name + version] --direct",
title="Server error ({c})".format(c=r.status_code))
print("Model successfully installed to %s" % data_path, file=sys.stderr) comp = r.json()['spacy']
if version not in comp:
util.sys_exit(
"No compatible models found for v{v} of spaCy.".format(v=version),
title="Compatibility error")
else:
return comp[version]
def get_version(model, comp):
    """Return the first version listed for `model` in the table `comp`.

    model: model name used as key into `comp`.
    comp: mapping of model name to a sequence of versions.
    Calls util.sys_exit() with a compatibility error if `model` is not a key
    of `comp`.
    """
    if model not in comp:
        template = "No compatible model found for {m} (spaCy v{v})."
        util.sys_exit(
            template.format(m=model, v=about.__version__),
            title="Compatibility error")
    versions = comp[model]
    return versions[0]
def download_model(filename):
    """Install a model archive through pip.

    filename: location of the archive relative to about.__download_url__,
        e.g. the '{m}-{v}/{m}-{v}.tar.gz' strings built by download().
    """
    util.print_msg("Downloading {f}".format(f=filename))
    # A URL always uses forward slashes. os.path.join would insert the
    # platform separator (a backslash on Windows) and break the download URL.
    download_url = about.__download_url__ + '/' + filename
    pip.main(['install', download_url])
def check_error_depr(model):
    """Exit with a message if no model was given or the deprecated 'all'
    keyword was used. Returns None for any other value.

    model: model name or shortcut received from the command line.
    """
    if not model:
        usage = "python -m spacy.download [name or shortcut]"
        util.sys_exit(usage, title="Missing model name or shortcut")
    if model == 'all':
        deprecation = (
            "As of v1.7.0, the download all command is deprecated. Please "
            "download the models individually via spacy.download [model name] "
            "or pip install. For more info on this, see the documentation: "
            "{d}".format(d=about.__docs__))
        util.sys_exit(deprecation, title="Deprecated command")
if __name__ == '__main__':
plac.call(download)

View File

@ -1,19 +1,16 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals
from os import path
from pathlib import Path
from ..util import match_best_version
from ..util import get_data_path
from ..language import Language from ..language import Language
from ..lemmatizer import Lemmatizer from ..lemmatizer import Lemmatizer
from ..vocab import Vocab from ..vocab import Vocab
from ..tokenizer import Tokenizer from ..tokenizer import Tokenizer
from ..attrs import LANG from ..attrs import LANG
from ..deprecated import fix_glove_vectors_loading
from .language_data import * from .language_data import *
try: try:
basestring basestring
except NameError: except NameError:
@ -38,34 +35,6 @@ class English(Language):
def __init__(self, **overrides): def __init__(self, **overrides):
# Make a special-case hack for loading the GloVe vectors, to support # Special-case hack for loading the GloVe vectors, to support <1.0
# deprecated <1.0 stuff. Phase this out once the data is fixed. overrides = fix_glove_vectors_loading(overrides)
overrides = _fix_deprecated_glove_vectors_loading(overrides)
Language.__init__(self, **overrides) Language.__init__(self, **overrides)
def _fix_deprecated_glove_vectors_loading(overrides):
if 'data_dir' in overrides and 'path' not in overrides:
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
if overrides.get('path') is False:
return overrides
if overrides.get('path') in (None, True):
data_path = get_data_path()
else:
path = overrides['path']
if isinstance(path, basestring):
path = Path(path)
data_path = path.parent
vec_path = None
if 'add_vectors' not in overrides:
if 'vectors' in overrides:
vec_path = match_best_version(overrides['vectors'], None, data_path)
if vec_path is None:
return overrides
else:
vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
if vec_path is not None:
vec_path = vec_path / 'vocab' / 'vec.bin'
if vec_path is not None:
overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
return overrides

View File

@ -1,25 +1,5 @@
import plac from ..deprecated import ModelDownload as download
import sputnik
from ..download import download
from .. import about
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
if force:
sputnik.purge(about.__title__, about.__version__)
if data_size in ('all', 'parser'):
print("Downloading parsing model")
download('en', force=False, data_path=data_path)
if data_size in ('all', 'glove'):
print("Downloading GloVe vectors")
download('en_glove_cc_300_1m_vectors', force=False, data_path=data_path)
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) download.en()

View File

@ -281,6 +281,7 @@ class Language(object):
if path is True: if path is True:
path = util.match_best_version(self.lang, '', util.get_data_path()) path = util.match_best_version(self.lang, '', util.get_data_path())
self.meta = overrides.get('meta', {})
self.path = path self.path = path
self.vocab = self.Defaults.create_vocab(self) \ self.vocab = self.Defaults.create_vocab(self) \

72
spacy/link.py Normal file
View File

@ -0,0 +1,72 @@
# coding: utf8
from __future__ import unicode_literals
import io
import os
import pip
import site
import plac
from . import util
@plac.annotations(
    origin=("Package name or path to model", "positional", None, str),
    link_name=("Name of link", "positional", None, str),
    force=("Force overwriting existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
    """Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name)."""
    # NOTE(review): docstring kept verbatim, since plac presumably shows it as
    # the command's help text. Confirm before rewording.
    if not is_package(origin):
        # Not an installed pip package: treat origin as a path to model data.
        symlink(origin, link_name, force)
        return
    site_packages = site.getsitepackages()[0]
    package_meta = get_meta(site_packages, origin)
    # The data directory inside the package is named '<package>-<version>'.
    versioned_dir = '-'.join((origin, package_meta['version']))
    model_path = os.path.join(site_packages, origin, versioned_dir)
    symlink(model_path, link_name, force)
def symlink(model_path, link_name, force):
    """Create the symlink `link_name` in the data directory, pointing at
    `model_path`.

    model_path: directory holding the model data; must exist.
    link_name: name of the link to create inside the data directory.
    force: if true, an existing link is removed first; otherwise an existing
        link makes the command exit with a message.
    """
    if not os.path.isdir(model_path):
        util.sys_exit(
            "The data should be located in {p}".format(p=model_path),
            title="Can't locate model data")

    data_path = str(util.get_data_path())
    link_path = os.path.join(os.path.abspath(__file__ + '/../../'), data_path, link_name)

    # os.path.isdir() follows symlinks, so a dangling link (its target was
    # removed) would go unnoticed and os.symlink() below would fail because
    # the name is taken. os.path.islink() detects that case as well.
    if os.path.islink(link_path) or os.path.isdir(link_path):
        if force:
            os.unlink(link_path)
        else:
            util.sys_exit(
                "To overwrite an existing link, use the --force flag.",
                title="Link {l} already exists".format(l=link_name))

    os.symlink(model_path, link_path)
    util.print_msg(
        "{a} --> {b}".format(a=model_path, b=link_path),
        "You can now load the model via spacy.load('{l}').".format(l=link_name),
        title="Linking successful")
def get_meta(package_path, package):
    """Return the parsed meta data of `package` found under `package_path`.

    Calls util.sys_exit() without any message of its own when
    util.parse_package_meta() returns a falsy value.
    """
    package_meta = util.parse_package_meta(package_path, package)
    if package_meta:
        return package_meta
    util.sys_exit()
    return package_meta
def is_package(origin):
    """Return True if `origin` names a distribution reported by pip.

    Hyphens in the distribution's project name are replaced with underscores
    before comparing, so `origin` is expected in its underscore form.
    """
    installed = pip.get_installed_distributions()
    return any(dist.project_name.replace('-', '_') == origin
               for dist in installed)
if __name__ == '__main__':
plac.call(link)

View File

@ -0,0 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals
from ..download import download, get_compatibility, get_version, check_error_depr
import pytest
def test_download_fetch_compatibility():
    """get_compatibility() should return a dict for the current version."""
    compatibility = get_compatibility()
    # isinstance is the idiomatic type check and also accepts dict subclasses.
    assert isinstance(compatibility, dict)
@pytest.mark.slow
@pytest.mark.parametrize('model', ['en_core_web_md-1.2.0'])
def test_download_direct_download(model):
    """A direct download of a full model name + version should not raise."""
    download(model=model, direct=True)
@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_succeeds(model):
    """get_version() returns a truthy version when the model is listed."""
    compatibility = {model: ['1.7.0', '0.100.0']}
    version = get_version(model, compatibility)
    assert version
@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_fails(model):
    """get_version() exits when the requested model is not in the table."""
    other_model = 'test_' + model
    compatibility = {other_model: ['1.7.0', '0.100.0']}
    with pytest.raises(SystemExit):
        assert get_version(model, compatibility)
@pytest.mark.parametrize('model', [False, None, '', 'all'])
def test_download_no_model_depr_error(model):
    """A missing model name and the deprecated 'all' keyword both exit."""
    with pytest.raises(SystemExit):
        check_error_depr(model)

View File

@ -1,40 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import io
import pytest import pytest
import dill as pickle import dill as pickle
from ..strings import StringStore
from ..vocab import Vocab from ..vocab import Vocab
from ..attrs import NORM from ..attrs import NORM
def test_pickle_string_store(): @pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
sstore = StringStore() def test_pickle_string_store(stringstore, text1, text2):
hello = sstore['hello'] store1 = stringstore[text1]
bye = sstore['bye'] store2 = stringstore[text2]
bdata = pickle.dumps(sstore, protocol=-1) data = pickle.dumps(stringstore, protocol=-1)
unpickled = pickle.loads(bdata) unpickled = pickle.loads(data)
assert unpickled['hello'] == hello assert unpickled[text1] == store1
assert unpickled['bye'] == bye assert unpickled[text2] == store2
assert len(sstore) == len(unpickled) assert len(stringstore) == len(unpickled)
@pytest.mark.xfail @pytest.mark.xfail
def test_pickle_vocab(): @pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
def test_pickle_vocab(text1, text2):
vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
dog = vocab[u'dog'] lex1 = vocab[text1]
cat = vocab[u'cat'] lex2 = vocab[text2]
assert dog.norm_ == 'do' assert lex1.norm_ == text1[:-1]
assert cat.norm_ == 'ca' assert lex2.norm_ == text2[:-1]
data = pickle.dumps(vocab)
bdata = pickle.dumps(vocab) unpickled = pickle.loads(data)
unpickled = pickle.loads(bdata) assert unpickled[text1].orth == lex1.orth
assert unpickled[text2].orth == lex2.orth
assert unpickled[u'dog'].orth == dog.orth assert unpickled[text1].norm == lex1.norm
assert unpickled[u'cat'].orth == cat.orth assert unpickled[text2].norm == lex2.norm
assert unpickled[u'dog'].norm == dog.norm assert unpickled[text1].norm != unpickled[text2].norm
assert unpickled[u'cat'].norm == cat.norm
dog_ = unpickled[u'dog']
cat_ = unpickled[u'cat']
assert dog_.norm != cat_.norm

View File

@ -1,13 +1,16 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals, print_function
import os import os
import io import io
import json import json
import re import re
import os.path import os.path
import pathlib import pathlib
import sys
import six import six
import textwrap
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
try: try:
@ -144,3 +147,53 @@ def check_renamed_kwargs(renamed, kwargs):
for old, new in renamed.items(): for old, new in renamed.items():
if old in kwargs: if old in kwargs:
raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def parse_package_meta(package_path, package):
    """Load the meta.json of a model package.

    package_path: directory containing the package; converted with str().
    package: name of the package directory inside `package_path`.
    RETURNS: the parsed JSON content, or False (after printing a message)
        if no meta.json file exists at the expected location.
    """
    meta_file = os.path.join(str(package_path), package, 'meta.json')
    if os.path.isfile(meta_file):
        with io.open(meta_file, encoding='utf8') as file_:
            return json.load(file_)
    print_msg("'{p}' doesn't seem to be a valid model package.".format(p=package),
              title="No meta.json found")
    return False
def print_msg(*text, **kwargs):
    """Print a formatted message.

    *text: each positional argument is wrapped and rendered as its own
        paragraph, separated by a blank line.
    title (keyword): if given and truthy, printed above the text and
        highlighted with a raw ANSI escape sequence (avoids an extra
        dependency).
    """
    # Wrap the paragraphs first so a bad argument fails before any output.
    body = '\n\n'.join([_wrap_text(paragraph) for paragraph in text])
    heading = kwargs.get('title')
    if heading:
        print('\n\033[93m{msg}\033[0m'.format(msg=_wrap_text(heading)))
    print('\n{msg}\n'.format(msg=body))
def _wrap_text(text):
    """Wrap `text` with the textwrap module and indent every line.

    The indent's length is subtracted from the maximum width of 80 so the
    wrapped lines, indent included, stay within that width. Long words and
    hyphenated words are never broken.
    """
    indent = ' '
    wrapper = textwrap.TextWrapper(
        width=80 - len(indent),
        initial_indent=indent,
        subsequent_indent=indent,
        break_long_words=False,
        break_on_hyphens=False)
    return wrapper.fill(text)
def sys_exit(*messages, **kwargs):
    """Performs SystemExit. For modules used from the command line, like
    download and link. To print message, use the same arguments as for
    print_msg().

    exits (keyword, optional): process exit status, 1 by default. This helper
        is used to abort on errors, so the status is non-zero unless the
        caller explicitly passes exits=0; a zero status would tell shells and
        scripts that the command succeeded.
    """
    exit_code = kwargs.pop('exits', 1)
    if messages:
        print_msg(*messages, **kwargs)
    sys.exit(exit_code)

View File

@ -57,20 +57,6 @@ p Many of the associated tools and resources that we're developing alongside spa
+cell +cell
| Super sparse multi-class machine learning with Cython. | Super sparse multi-class machine learning with Cython.
+row
+cell
+src(gh("sputnik")) Sputnik
+cell
| Data package manager library for spaCy.
+row
+cell
+src(gh("sputnik-server")) Sputnik Server
+cell
| Index service for the Sputnik data package manager for spaCy.
+row +row
+cell +cell
+src(gh("cymem")) Cymem +src(gh("cymem")) Cymem