diff --git a/requirements.txt b/requirements.txt index a5db71dd5..a1721b02c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,5 @@ murmurhash>=0.26,<0.27 plac<0.9.3 six ujson>=1.35 -sputnik>=0.9.2,<0.10.0 dill>=0.2,<0.3 +requests>=2.13.0,<3.0.0 diff --git a/setup.py b/setup.py index 26f395ea5..b774994f6 100644 --- a/setup.py +++ b/setup.py @@ -240,9 +240,9 @@ def setup_package(): 'plac<0.9.3', 'six', 'pathlib', - 'sputnik>=0.9.2,<0.10.0', 'ujson>=1.35', - 'dill>=0.2,<0.3'], + 'dill>=0.2,<0.3', + 'requests>=2.13.0,<3.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', diff --git a/spacy/__init__.py b/spacy/__init__.py index d924c1532..ca5a39f05 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,7 +1,9 @@ -import pathlib +# coding: utf8 +from __future__ import unicode_literals, print_function -from .util import set_lang_class, get_lang_class -from .about import __version__ +import json +from pathlib import Path +from .util import set_lang_class, get_lang_class, parse_package_meta from . import en from . import de @@ -16,11 +18,6 @@ from . import sv from . import fi from . import bn -try: - basestring -except NameError: - basestring = str - set_lang_class(en.English.lang, en.English) set_lang_class(de.German.lang, de.German) @@ -36,11 +33,16 @@ set_lang_class(fi.Finnish.lang, fi.Finnish) set_lang_class(bn.Bengali.lang, bn.Bengali) - def load(name, **overrides): - target_name, target_version = util.split_data_name(name) data_path = overrides.get('path', util.get_data_path()) - path = util.match_best_version(target_name, target_version, data_path) - cls = get_lang_class(target_name) - overrides['path'] = path + meta = parse_package_meta(data_path, name) + lang = meta['lang'] if meta and 'lang' in meta else 'en' + cls = get_lang_class(lang) + overrides['meta'] = meta + overrides['path'] = Path(data_path / name) return cls(**overrides) + + +def info(name): + meta = parse_package_meta(util.get_data_path(), name) + print(json.dumps(meta, indent=2)) diff --git a/spacy/about.py b/spacy/about.py index d51dea286..8e21ab316 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,4 @@ # inspired from: - # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py @@ -10,7 +9,8 @@ __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' __email__ = 'matt@explosion.ai' __license__ = 'MIT' -__models__ = { - 'en': 'en>=1.1.0,<1.2.0', - 'de': 'de>=1.0.0,<1.1.0', -} + +__docs__ = 'https://spacy.io/docs/usage' +__download_url__ = 'https://github.com/explosion/spacy-models/releases/download' +__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json?token=ANAt54fi5zcUtnwGhMLw2klWwcAyHkZGks5Y0nw1wA%3D%3D' +__shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_web_md', 'vectors': 'en_vectors_glove_md'} diff --git a/spacy/de/download.py b/spacy/de/download.py index 4f02f0474..239d46884 100644 --- a/spacy/de/download.py +++ b/spacy/de/download.py @@ -1,14 +1,5 @@ -import plac -from ..download import download - - -@plac.annotations( - force=("Force overwrite", "flag", "f", bool), - data_path=("Path to download model", "option", "d", str) -) -def main(data_size='all', force=False, data_path=None): - download('de', force=force, data_path=data_path) +from ..deprecated import ModelDownload as download if __name__ == '__main__': - plac.call(main) + download.de() diff --git a/spacy/deprecated.py b/spacy/deprecated.py index d75354f9c..72327c584 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -1,35 +1,13 @@ -from sputnik.dir_package import DirPackage -from sputnik.package_list import (PackageNotFoundException, - CompatiblePackageNotFoundException) - -import sputnik +from pathlib import Path from . import about +from . import util +from .download import download -def get_package(data_dir): - if not isinstance(data_dir, six.string_types): - raise RuntimeError('data_dir must be a string') - return DirPackage(data_dir) - - -def get_package_by_name(name=None, via=None): - if name is None: - return - lang = get_lang_class(name) - try: - return sputnik.package(about.__title__, about.__version__, - name, data_path=via) - except PackageNotFoundException as e: - raise RuntimeError("Model '%s' not installed. Please run 'python -m " - "%s.download' to install latest compatible " - "model." % (name, lang.__module__)) - except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model is not compatible with spaCy " - "version. Please run 'python -m %s.download " - "--force' to install latest compatible model." % - (lang.__module__)) - - +try: + basestring +except NameError: + basestring = str def read_lang_data(package): @@ -43,7 +21,6 @@ def read_lang_data(package): return tokenization, prefix, suffix, infix - def align_tokens(ref, indices): # Deprecated, surely? start = 0 queue = list(indices) @@ -79,4 +56,55 @@ def detokenize(token_rules, words): # Deprecated? return positions +def fix_glove_vectors_loading(overrides): + """Special-case hack for loading the GloVe vectors, to support deprecated + <1.0 stuff. Phase this out once the data is fixed.""" + if 'data_dir' in overrides and 'path' not in overrides: + raise ValueError("The argument 'data_dir' has been renamed to 'path'") + if overrides.get('path') is False: + return overrides + if overrides.get('path') in (None, True): + data_path = util.get_data_path() + else: + path = overrides['path'] + if isinstance(path, basestring): + path = Path(path) + data_path = path.parent + vec_path = None + if 'add_vectors' not in overrides: + if 'vectors' in overrides: + vec_path = util.match_best_version(overrides['vectors'], None, data_path) + if vec_path is None: + return overrides + else: + vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path) + if vec_path is not None: + vec_path = vec_path / 'vocab' / 'vec.bin' + if vec_path is not None: + overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path) + return overrides + + +class ModelDownload(): + """Replace download modules within en and de with deprecation warning and + download default language model (using shortcut). Use classmethods to allow + importing ModelDownload as download and calling download.en() etc.""" + + @classmethod + def load(self, lang): + util.print_msg( + "The spacy.{l}.download command is now deprecated. Please use " + "spacy.download [model name or shortcut] instead. For more " + "info and available models, see the documentation: {d}. " + "Downloading default '{l}' model now...".format(d=about.__docs__, l=lang), + title="Warning: deprecated command") + download(lang) + + @classmethod + def en(cls, *args, **kwargs): + cls.load('en') + + @classmethod + def de(cls, *args, **kwargs): + cls.load('de') diff --git a/spacy/download.py b/spacy/download.py index 694638149..b00f11d18 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -1,47 +1,80 @@ -from __future__ import print_function - -import sys -import shutil - -import sputnik -from sputnik.package_list import (PackageNotFoundException, - CompatiblePackageNotFoundException) +# coding: utf8 +from __future__ import unicode_literals +import pip +import plac +import requests +from os import path from . import about from . import util -def download(lang, force=False, fail_on_exist=True, data_path=None): - if not data_path: - data_path = util.get_data_path(require_exists=False) +@plac.annotations( + model=("Model to download", "positional", None, str), + direct=("Force direct download", "flag", "d", bool) +) +def download(model=None, direct=False): + """Download compatible model from default download path using pip.""" - # spaCy uses pathlib, and util.get_data_path returns a pathlib.Path object, - # but sputnik (which we're using below) doesn't use pathlib and requires - # its data_path parameters to be strings, so we coerce the data_path to a - # str here. - data_path = str(data_path) + check_error_depr(model) - try: - pkg = sputnik.package(about.__title__, about.__version__, - about.__models__.get(lang, lang), data_path) - if force: - shutil.rmtree(pkg.path) - elif fail_on_exist: - print("Model already installed. Please run 'python -m " - "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr) - sys.exit(0) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - pass + if direct: + download_model('{m}/{m}.tar.gz'.format(m=model)) + else: + model = about.__shortcuts__[model] if model in about.__shortcuts__ else model + compatibility = get_compatibility() + version = get_version(model, compatibility) + download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model, v=version)) - package = sputnik.install(about.__title__, about.__version__, - about.__models__.get(lang, lang), data_path) - try: - sputnik.package(about.__title__, about.__version__, - about.__models__.get(lang, lang), data_path) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - print("Model failed to install. Please run 'python -m " - "spacy.%s.download --force'." % lang, file=sys.stderr) - sys.exit(1) +def get_compatibility(): + version = about.__version__ + r = requests.get(about.__compatibility__) + if r.status_code != 200: + util.sys_exit( + "Couldn't fetch compatibility table. Please find the right model for " + "your spaCy installation (v{v}), and download it manually:".format(v=version), + "python -m spacy.download [full model name + version] --direct", + title="Server error ({c})".format(c=r.status_code)) - print("Model successfully installed to %s" % data_path, file=sys.stderr) + comp = r.json()['spacy'] + if version not in comp: + util.sys_exit( + "No compatible models found for v{v} of spaCy.".format(v=version), + title="Compatibility error") + else: + return comp[version] + + +def get_version(model, comp): + if model not in comp: + util.sys_exit( + "No compatible model found for " + "{m} (spaCy v{v}).".format(m=model, v=about.__version__), + title="Compatibility error") + return comp[model][0] + + +def download_model(filename): + util.print_msg("Downloading {f}".format(f=filename)) + download_url = path.join(about.__download_url__, filename) + pip.main(['install', download_url]) + + +def check_error_depr(model): + if not model: + util.sys_exit( + "python -m spacy.download [name or shortcut]", + title="Missing model name or shortcut") + + if model == 'all': + util.sys_exit( + "As of v1.7.0, the download all command is deprecated. Please " + "download the models individually via spacy.download [model name] " + "or pip install. For more info on this, see the documentation: " + "{d}".format(d=about.__docs__), + title="Deprecated command") + + +if __name__ == '__main__': + plac.call(download) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 7262f37fb..775d42a2b 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,19 +1,16 @@ # coding: utf8 -from __future__ import unicode_literals, print_function +from __future__ import unicode_literals -from os import path -from pathlib import Path - -from ..util import match_best_version -from ..util import get_data_path from ..language import Language from ..lemmatizer import Lemmatizer from ..vocab import Vocab from ..tokenizer import Tokenizer from ..attrs import LANG +from ..deprecated import fix_glove_vectors_loading from .language_data import * + try: basestring except NameError: @@ -38,34 +35,6 @@ class English(Language): def __init__(self, **overrides): - # Make a special-case hack for loading the GloVe vectors, to support - # deprecated <1.0 stuff. Phase this out once the data is fixed. - overrides = _fix_deprecated_glove_vectors_loading(overrides) + # Special-case hack for loading the GloVe vectors, to support <1.0 + overrides = fix_glove_vectors_loading(overrides) Language.__init__(self, **overrides) - - -def _fix_deprecated_glove_vectors_loading(overrides): - if 'data_dir' in overrides and 'path' not in overrides: - raise ValueError("The argument 'data_dir' has been renamed to 'path'") - if overrides.get('path') is False: - return overrides - if overrides.get('path') in (None, True): - data_path = get_data_path() - else: - path = overrides['path'] - if isinstance(path, basestring): - path = Path(path) - data_path = path.parent - vec_path = None - if 'add_vectors' not in overrides: - if 'vectors' in overrides: - vec_path = match_best_version(overrides['vectors'], None, data_path) - if vec_path is None: - return overrides - else: - vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path) - if vec_path is not None: - vec_path = vec_path / 'vocab' / 'vec.bin' - if vec_path is not None: - overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path) - return overrides diff --git a/spacy/en/download.py b/spacy/en/download.py index 7a2d58234..6d0a8dd40 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -1,25 +1,5 @@ -import plac -import sputnik - -from ..download import download -from .. import about - - -@plac.annotations( - force=("Force overwrite", "flag", "f", bool), - data_path=("Path to download model", "option", "d", str) -) -def main(data_size='all', force=False, data_path=None): - if force: - sputnik.purge(about.__title__, about.__version__) - - if data_size in ('all', 'parser'): - print("Downloading parsing model") - download('en', force=False, data_path=data_path) - if data_size in ('all', 'glove'): - print("Downloading GloVe vectors") - download('en_glove_cc_300_1m_vectors', force=False, data_path=data_path) +from ..deprecated import ModelDownload as download if __name__ == '__main__': - plac.call(main) + download.en() diff --git a/spacy/language.py b/spacy/language.py index ac38c5754..ed5a7ed96 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -281,6 +281,7 @@ class Language(object): if path is True: path = util.match_best_version(self.lang, '', util.get_data_path()) + self.meta = overrides.get('meta', {}) self.path = path self.vocab = self.Defaults.create_vocab(self) \ diff --git a/spacy/link.py b/spacy/link.py new file mode 100644 index 000000000..2626861fd --- /dev/null +++ b/spacy/link.py @@ -0,0 +1,72 @@ +# coding: utf8 +from __future__ import unicode_literals + +import io +import os +import pip +import site +import plac +from . import util + + +@plac.annotations( + origin=("Package name or path to model", "positional", None, str), + link_name=("Name of link", "positional", None, str), + force=("Force overwriting existing link", "flag", "f", bool) +) +def link(origin, link_name, force=False): + """Create a symlink for models within the spacy/data directory. Accepts + either the name of a pip package, or the local path to the model data + directory. Linking models allows loading them via spacy.load(link_name).""" + + if is_package(origin): + package_path = site.getsitepackages()[0] + meta = get_meta(package_path, origin) + data_dir = origin + '-' + meta['version'] + model_path = os.path.join(package_path, origin, data_dir) + symlink(model_path, link_name, force) + else: + symlink(origin, link_name, force) + + +def symlink(model_path, link_name, force): + if not os.path.isdir(model_path): + util.sys_exit( + "The data should be located in {p}".format(p=model_path), + title="Can't locate model data") + + data_path = str(util.get_data_path()) + link_path = os.path.join(os.path.abspath(__file__ + '/../../'), data_path, link_name) + + if os.path.isdir(link_path): + if force: + os.unlink(link_path) + else: + util.sys_exit( + "To overwrite an existing link, use the --force flag.", + title="Link {l} already exists".format(l=link_name)) + + os.symlink(model_path, link_path) + util.print_msg( + "{a} --> {b}".format(a=model_path, b=link_path), + "You can now load the model via spacy.load('{l}').".format(l=link_name), + title="Linking successful") + + +def get_meta(package_path, package): + meta = util.parse_package_meta(package_path, package) + if not meta: + util.sys_exit() + return meta + + +def is_package(origin): + packages = pip.get_installed_distributions() + for package in packages: + if package.project_name.replace('-', '_') == origin: + return True + return False + + +if __name__ == '__main__': + plac.call(link) diff --git a/spacy/tests/test_download.py b/spacy/tests/test_download.py new file mode 100644 index 000000000..8d67364ea --- /dev/null +++ b/spacy/tests/test_download.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..download import download, get_compatibility, get_version, check_error_depr +import pytest + + +def test_download_fetch_compatibility(): + compatibility = get_compatibility() + assert type(compatibility) == dict + + +@pytest.mark.slow +@pytest.mark.parametrize('model', ['en_core_web_md-1.2.0']) +def test_download_direct_download(model): + download(model, direct=True) + + +@pytest.mark.parametrize('model', ['en_core_web_md']) +def test_download_get_matching_version_succeeds(model): + comp = { model: ['1.7.0', '0.100.0'] } + assert get_version(model, comp) + + +@pytest.mark.parametrize('model', ['en_core_web_md']) +def test_download_get_matching_version_fails(model): + diff_model = 'test_' + model + comp = { diff_model: ['1.7.0', '0.100.0'] } + with pytest.raises(SystemExit): + assert get_version(model, comp) + + +@pytest.mark.parametrize('model', [False, None, '', 'all']) +def test_download_no_model_depr_error(model): + with pytest.raises(SystemExit): + check_error_depr(model) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 2e7fc6bf7..41e15884f 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,40 +1,36 @@ +# coding: utf-8 from __future__ import unicode_literals -import io import pytest import dill as pickle -from ..strings import StringStore from ..vocab import Vocab from ..attrs import NORM -def test_pickle_string_store(): - sstore = StringStore() - hello = sstore['hello'] - bye = sstore['bye'] - bdata = pickle.dumps(sstore, protocol=-1) - unpickled = pickle.loads(bdata) - assert unpickled['hello'] == hello - assert unpickled['bye'] == bye - assert len(sstore) == len(unpickled) +@pytest.mark.parametrize('text1,text2', [('hello', 'bye')]) +def test_pickle_string_store(stringstore, text1, text2): + store1 = stringstore[text1] + store2 = stringstore[text2] + data = pickle.dumps(stringstore, protocol=-1) + unpickled = pickle.loads(data) + assert unpickled[text1] == store1 + assert unpickled[text2] == store2 + assert len(stringstore) == len(unpickled) @pytest.mark.xfail -def test_pickle_vocab(): +@pytest.mark.parametrize('text1,text2', [('dog', 'cat')]) +def test_pickle_vocab(text1, text2): vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) - dog = vocab[u'dog'] - cat = vocab[u'cat'] - assert dog.norm_ == 'do' - assert cat.norm_ == 'ca' - - bdata = pickle.dumps(vocab) - unpickled = pickle.loads(bdata) - - assert unpickled[u'dog'].orth == dog.orth - assert unpickled[u'cat'].orth == cat.orth - assert unpickled[u'dog'].norm == dog.norm - assert unpickled[u'cat'].norm == cat.norm - dog_ = unpickled[u'dog'] - cat_ = unpickled[u'cat'] - assert dog_.norm != cat_.norm + lex1 = vocab[text1] + lex2 = vocab[text2] + assert lex1.norm_ == text1[:-1] + assert lex2.norm_ == text2[:-1] + data = pickle.dumps(vocab) + unpickled = pickle.loads(data) + assert unpickled[text1].orth == lex1.orth + assert unpickled[text2].orth == lex2.orth + assert unpickled[text1].norm == lex1.norm + assert unpickled[text2].norm == lex2.norm + assert unpickled[text1].norm != unpickled[text2].norm diff --git a/spacy/util.py b/spacy/util.py index dd1ae1b3b..946ce6aab 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,13 +1,16 @@ # coding: utf8 -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import os import io import json import re import os.path import pathlib +import sys import six +import textwrap + from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE try: @@ -144,3 +147,53 @@ def check_renamed_kwargs(renamed, kwargs): for old, new in renamed.items(): if old in kwargs: raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) + + +def parse_package_meta(package_path, package): + location = os.path.join(str(package_path), package, 'meta.json') + if not os.path.isfile(location): + print_msg("'{p}' doesn't seem to be a valid model package.".format(p=package), + title="No meta.json found") + else: + with io.open(location, encoding='utf8') as f: + meta = json.load(f) + return meta + return False + + +def print_msg(*text, **kwargs): + """Print formatted message. Each positional argument is rendered as newline- + separated paragraph. If kwarg 'title' exist, title is printed above the text + and highlighted (using ANSI escape sequences manually to avoid unnecessary + dependency).""" + + message = '\n\n'.join([_wrap_text(t) for t in text]) + tpl_msg = '\n{msg}\n' + tpl_title = '\n\033[93m{msg}\033[0m' + + if 'title' in kwargs and kwargs['title']: + title = _wrap_text(kwargs['title']) + print(tpl_title.format(msg=title)) + print(tpl_msg.format(msg=message)) + + +def _wrap_text(text): + """Wrap text at given width using textwrap module. Indent should consist of + spaces. Its length is deducted from wrap width to ensure exact wrapping.""" + + wrap_max = 80 + indent = ' ' + wrap_width = wrap_max - len(indent) + return textwrap.fill(text, width=wrap_width, initial_indent=indent, + subsequent_indent=indent, break_long_words=False, + break_on_hyphens=False) + + +def sys_exit(*messages, **kwargs): + """Performs SystemExit. For modules used from the command line, like + download and link. To print message, use the same arguments as for + print_msg().""" + + if messages: + print_msg(*messages, **kwargs) + sys.exit(0) diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade index 2b80ebe48..754b951c7 100644 --- a/website/docs/usage/resources.jade +++ b/website/docs/usage/resources.jade @@ -57,20 +57,6 @@ p Many of the associated tools and resources that we're developing alongside spa +cell | Super sparse multi-class machine learning with Cython. - +row - +cell - +src(gh("sputnik")) Sputnik - - +cell - | Data package manager library for spaCy. - - +row - +cell - +src(gh("sputnik-server")) Sputnik Server - - +cell - | Index service for the Sputnik data package manager for spaCy. - +row +cell +src(gh("cymem")) Cymem