Mirror of https://github.com/explosion/spaCy.git

Commit 8843b84bd1: Merge remote-tracking branch 'origin/develop-downloads'
@@ -8,5 +8,5 @@ murmurhash>=0.26,<0.27
plac<0.9.3
six
ujson>=1.35
sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3
requests>=2.13.0,<3.0.0
setup.py (4 changed lines)

@@ -240,9 +240,9 @@ def setup_package():
        'plac<0.9.3',
        'six',
        'pathlib',
        'sputnik>=0.9.2,<0.10.0',
        'ujson>=1.35',
        'dill>=0.2,<0.3'],
        'dill>=0.2,<0.3',
        'requests>=2.13.0,<3.0.0'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
@@ -1,7 +1,9 @@
import pathlib
# coding: utf8
from __future__ import unicode_literals, print_function

from .util import set_lang_class, get_lang_class
from .about import __version__
import json
from pathlib import Path
from .util import set_lang_class, get_lang_class, parse_package_meta

from . import en
from . import de

@@ -16,11 +18,6 @@ from . import sv
from . import fi
from . import bn

try:
    basestring
except NameError:
    basestring = str


set_lang_class(en.English.lang, en.English)
set_lang_class(de.German.lang, de.German)

@@ -36,11 +33,16 @@ set_lang_class(fi.Finnish.lang, fi.Finnish)
set_lang_class(bn.Bengali.lang, bn.Bengali)


def load(name, **overrides):
    target_name, target_version = util.split_data_name(name)
    data_path = overrides.get('path', util.get_data_path())
    path = util.match_best_version(target_name, target_version, data_path)
    cls = get_lang_class(target_name)
    overrides['path'] = path
    meta = parse_package_meta(data_path, name)
    lang = meta['lang'] if meta and 'lang' in meta else 'en'
    cls = get_lang_class(lang)
    overrides['meta'] = meta
    overrides['path'] = Path(data_path / name)
    return cls(**overrides)


def info(name):
    meta = parse_package_meta(util.get_data_path(), name)
    print(json.dumps(meta, indent=2))
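The hunks above appear to be spaCy's package-level __init__.py. The rewritten load() no longer asks sputnik to match a versioned data package; it reads the installed package's meta.json via parse_package_meta() and picks the language class from the meta's 'lang' field (falling back to 'en'), and the new info() helper pretty-prints that metadata. A minimal usage sketch under that reading; it assumes a model or link named 'en_core_web_sm' already sits in the data directory:

import spacy

# Reads <data_path>/en_core_web_sm/meta.json and instantiates the Language
# subclass registered for meta['lang'] (defaulting to 'en' if missing).
nlp = spacy.load('en_core_web_sm')

# Pretty-prints the same meta.json (name, version, lang, description, ...).
spacy.info('en_core_web_sm')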
@@ -1,5 +1,4 @@
# inspired from:

# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

@@ -10,7 +9,8 @@ __uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@explosion.ai'
__license__ = 'MIT'
__models__ = {
    'en': 'en>=1.1.0,<1.2.0',
    'de': 'de>=1.0.0,<1.1.0',
}

__docs__ = 'https://spacy.io/docs/usage'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json?token=ANAt54fi5zcUtnwGhMLw2klWwcAyHkZGks5Y0nw1wA%3D%3D'
__shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_web_md', 'vectors': 'en_vectors_glove_md'}
@@ -1,14 +1,5 @@
import plac
from ..download import download


@plac.annotations(
    force=("Force overwrite", "flag", "f", bool),
    data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
    download('de', force=force, data_path=data_path)
from ..deprecated import ModelDownload as download


if __name__ == '__main__':
    plac.call(main)
    download.de()
@@ -1,35 +1,13 @@
from sputnik.dir_package import DirPackage
from sputnik.package_list import (PackageNotFoundException,
                                  CompatiblePackageNotFoundException)

import sputnik
from pathlib import Path
from . import about
from . import util
from .download import download


def get_package(data_dir):
    if not isinstance(data_dir, six.string_types):
        raise RuntimeError('data_dir must be a string')
    return DirPackage(data_dir)


def get_package_by_name(name=None, via=None):
    if name is None:
        return
    lang = get_lang_class(name)
    try:
        return sputnik.package(about.__title__, about.__version__,
                               name, data_path=via)
    except PackageNotFoundException as e:
        raise RuntimeError("Model '%s' not installed. Please run 'python -m "
                           "%s.download' to install latest compatible "
                           "model." % (name, lang.__module__))
    except CompatiblePackageNotFoundException as e:
        raise RuntimeError("Installed model is not compatible with spaCy "
                           "version. Please run 'python -m %s.download "
                           "--force' to install latest compatible model." %
                           (lang.__module__))


try:
    basestring
except NameError:
    basestring = str


def read_lang_data(package):

@@ -43,7 +21,6 @@ read_lang_data(package)
    return tokenization, prefix, suffix, infix


def align_tokens(ref, indices): # Deprecated, surely?
    start = 0
    queue = list(indices)

@@ -79,4 +56,55 @@ def detokenize(token_rules, words): # Deprecated?
    return positions


def fix_glove_vectors_loading(overrides):
    """Special-case hack for loading the GloVe vectors, to support deprecated
    <1.0 stuff. Phase this out once the data is fixed."""

    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    if overrides.get('path') is False:
        return overrides
    if overrides.get('path') in (None, True):
        data_path = util.get_data_path()
    else:
        path = overrides['path']
        if isinstance(path, basestring):
            path = Path(path)
        data_path = path.parent
    vec_path = None
    if 'add_vectors' not in overrides:
        if 'vectors' in overrides:
            vec_path = util.match_best_version(overrides['vectors'], None, data_path)
            if vec_path is None:
                return overrides
        else:
            vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
        if vec_path is not None:
            vec_path = vec_path / 'vocab' / 'vec.bin'
    if vec_path is not None:
        overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
    return overrides


class ModelDownload():
    """Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
    importing ModelDownload as download and calling download.en() etc."""

    @classmethod
    def load(self, lang):
        util.print_msg(
            "The spacy.{l}.download command is now deprecated. Please use "
            "spacy.download [model name or shortcut] instead. For more "
            "info and available models, see the documentation: {d}. "
            "Downloading default '{l}' model now...".format(d=about.__docs__, l=lang),
            title="Warning: deprecated command")
        download(lang)

    @classmethod
    def en(cls, *args, **kwargs):
        cls.load('en')

    @classmethod
    def de(cls, *args, **kwargs):
        cls.load('de')
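The last hunk above adds two things to what looks like spacy/deprecated.py: fix_glove_vectors_loading(), which also exists almost verbatim in the English package further down this diff, and a ModelDownload shim that keeps the old per-language download entry points alive. A short sketch of how the shim is exercised, mirroring the rewritten de and en download modules elsewhere in this commit:

from spacy.deprecated import ModelDownload as download

# Prints the "Warning: deprecated command" notice via util.print_msg(), then
# forwards to the new pip-based downloader using the default 'en' shortcut.
download.en()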
@@ -1,47 +1,80 @@
from __future__ import print_function

import sys
import shutil

import sputnik
from sputnik.package_list import (PackageNotFoundException,
                                  CompatiblePackageNotFoundException)
# coding: utf8
from __future__ import unicode_literals

import pip
import plac
import requests
from os import path
from . import about
from . import util


def download(lang, force=False, fail_on_exist=True, data_path=None):
    if not data_path:
        data_path = util.get_data_path(require_exists=False)
@plac.annotations(
    model=("Model to download", "positional", None, str),
    direct=("Force direct download", "flag", "d", bool)
)
def download(model=None, direct=False):
    """Download compatible model from default download path using pip."""

    # spaCy uses pathlib, and util.get_data_path returns a pathlib.Path object,
    # but sputnik (which we're using below) doesn't use pathlib and requires
    # its data_path parameters to be strings, so we coerce the data_path to a
    # str here.
    data_path = str(data_path)
    check_error_depr(model)

    try:
        pkg = sputnik.package(about.__title__, about.__version__,
                              about.__models__.get(lang, lang), data_path)
        if force:
            shutil.rmtree(pkg.path)
        elif fail_on_exist:
            print("Model already installed. Please run 'python -m "
                  "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr)
            sys.exit(0)
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        pass
    if direct:
        download_model('{m}/{m}.tar.gz'.format(m=model))
    else:
        model = about.__shortcuts__[model] if model in about.__shortcuts__ else model
        compatibility = get_compatibility()
        version = get_version(model, compatibility)
        download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model, v=version))

    package = sputnik.install(about.__title__, about.__version__,
                              about.__models__.get(lang, lang), data_path)

    try:
        sputnik.package(about.__title__, about.__version__,
                        about.__models__.get(lang, lang), data_path)
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        print("Model failed to install. Please run 'python -m "
              "spacy.%s.download --force'." % lang, file=sys.stderr)
        sys.exit(1)
def get_compatibility():
    version = about.__version__
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        util.sys_exit(
            "Couldn't fetch compatibility table. Please find the right model for "
            "your spaCy installation (v{v}), and download it manually:".format(v=version),
            "python -m spacy.download [full model name + version] --direct",
            title="Server error ({c})".format(c=r.status_code))

    print("Model successfully installed to %s" % data_path, file=sys.stderr)
    comp = r.json()['spacy']
    if version not in comp:
        util.sys_exit(
            "No compatible models found for v{v} of spaCy.".format(v=version),
            title="Compatibility error")
    else:
        return comp[version]


def get_version(model, comp):
    if model not in comp:
        util.sys_exit(
            "No compatible model found for "
            "{m} (spaCy v{v}).".format(m=model, v=about.__version__),
            title="Compatibility error")
    return comp[model][0]


def download_model(filename):
    util.print_msg("Downloading {f}".format(f=filename))
    download_url = path.join(about.__download_url__, filename)
    pip.main(['install', download_url])


def check_error_depr(model):
    if not model:
        util.sys_exit(
            "python -m spacy.download [name or shortcut]",
            title="Missing model name or shortcut")

    if model == 'all':
        util.sys_exit(
            "As of v1.7.0, the download all command is deprecated. Please "
            "download the models individually via spacy.download [model name] "
            "or pip install. For more info on this, see the documentation: "
            "{d}".format(d=about.__docs__),
            title="Deprecated command")


if __name__ == '__main__':
    plac.call(download)
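The hunk above swaps the sputnik-based downloader for a pip-based one: the model argument is resolved through about.__shortcuts__, checked against a compatibility table fetched from about.__compatibility__, and the matching release archive under about.__download_url__ is handed to pip. A sketch of the intended command-line calls, plus the table shape implied by get_compatibility() and get_version(); the version numbers are made up for illustration:

# Shortcut or model name, resolved against the compatibility table:
#     python -m spacy.download en
# Exact release, skipping the compatibility lookup:
#     python -m spacy.download en_core_web_md-1.2.0 --direct

compatibility = {
    'spacy': {
        '1.7.0': {
            'en_core_web_sm': ['1.2.0', '1.1.0'],
            'de_core_web_md': ['1.0.0'],
        }
    }
}
# get_compatibility() returns compatibility['spacy'][about.__version__];
# get_version('en_core_web_sm', ...) then picks the first entry, '1.2.0'.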
@@ -1,19 +1,16 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals

from os import path
from pathlib import Path

from ..util import match_best_version
from ..util import get_data_path
from ..language import Language
from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
from ..deprecated import fix_glove_vectors_loading

from .language_data import *


try:
    basestring
except NameError:

@@ -38,34 +35,6 @@ class English(Language):

    def __init__(self, **overrides):
        # Make a special-case hack for loading the GloVe vectors, to support
        # deprecated <1.0 stuff. Phase this out once the data is fixed.
        overrides = _fix_deprecated_glove_vectors_loading(overrides)
        # Special-case hack for loading the GloVe vectors, to support <1.0
        overrides = fix_glove_vectors_loading(overrides)
        Language.__init__(self, **overrides)


def _fix_deprecated_glove_vectors_loading(overrides):
    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    if overrides.get('path') is False:
        return overrides
    if overrides.get('path') in (None, True):
        data_path = get_data_path()
    else:
        path = overrides['path']
        if isinstance(path, basestring):
            path = Path(path)
        data_path = path.parent
    vec_path = None
    if 'add_vectors' not in overrides:
        if 'vectors' in overrides:
            vec_path = match_best_version(overrides['vectors'], None, data_path)
            if vec_path is None:
                return overrides
        else:
            vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
        if vec_path is not None:
            vec_path = vec_path / 'vocab' / 'vec.bin'
    if vec_path is not None:
        overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
    return overrides
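The two hunks above look like the English package __init__: English.__init__ now routes its overrides through the shared fix_glove_vectors_loading() from spacy.deprecated instead of the module-private _fix_deprecated_glove_vectors_loading() being deleted here. One behaviour that carries over, verifiable from either copy of the function:

from spacy.deprecated import fix_glove_vectors_loading

# path=False opts out of the GloVe special-casing; the overrides pass through untouched.
overrides = fix_glove_vectors_loading({'path': False})
assert overrides == {'path': False}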
@@ -1,25 +1,5 @@
import plac
import sputnik

from ..download import download
from .. import about


@plac.annotations(
    force=("Force overwrite", "flag", "f", bool),
    data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
    if force:
        sputnik.purge(about.__title__, about.__version__)

    if data_size in ('all', 'parser'):
        print("Downloading parsing model")
        download('en', force=False, data_path=data_path)
    if data_size in ('all', 'glove'):
        print("Downloading GloVe vectors")
        download('en_glove_cc_300_1m_vectors', force=False, data_path=data_path)
from ..deprecated import ModelDownload as download


if __name__ == '__main__':
    plac.call(main)
    download.en()
@@ -281,6 +281,7 @@ class Language(object):
        if path is True:
            path = util.match_best_version(self.lang, '', util.get_data_path())

        self.meta = overrides.get('meta', {})
        self.path = path

        self.vocab = self.Defaults.create_vocab(self) \
spacy/link.py (new file, 72 lines)

@@ -0,0 +1,72 @@
# coding: utf8
from __future__ import unicode_literals

import io
import os
import pip
import site
import plac
from . import util


@plac.annotations(
    origin=("Package name or path to model", "positional", None, str),
    link_name=("Name of link", "positional", None, str),
    force=("Force overwriting existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
    """Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name)."""

    if is_package(origin):
        package_path = site.getsitepackages()[0]
        meta = get_meta(package_path, origin)
        data_dir = origin + '-' + meta['version']
        model_path = os.path.join(package_path, origin, data_dir)
        symlink(model_path, link_name, force)
    else:
        symlink(origin, link_name, force)


def symlink(model_path, link_name, force):
    if not os.path.isdir(model_path):
        util.sys_exit(
            "The data should be located in {p}".format(p=model_path),
            title="Can't locate model data")

    data_path = str(util.get_data_path())
    link_path = os.path.join(os.path.abspath(__file__ + '/../../'), data_path, link_name)

    if os.path.isdir(link_path):
        if force:
            os.unlink(link_path)
        else:
            util.sys_exit(
                "To overwrite an existing link, use the --force flag.",
                title="Link {l} already exists".format(l=link_name))

    os.symlink(model_path, link_path)
    util.print_msg(
        "{a} --> {b}".format(a=model_path, b=link_path),
        "You can now load the model via spacy.load('{l}').".format(l=link_name),
        title="Linking successful")


def get_meta(package_path, package):
    meta = util.parse_package_meta(package_path, package)
    if not meta:
        util.sys_exit()
    return meta


def is_package(origin):
    packages = pip.get_installed_distributions()
    for package in packages:
        if package.project_name.replace('-', '_') == origin:
            return True
    return False


if __name__ == '__main__':
    plac.call(link)
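spacy/link.py is new: it symlinks either an installed pip package (located via site-packages and its meta.json) or a local model directory into spacy/data, so the link name becomes loadable. A usage sketch; the package name, path and link name are only examples:

# Link an installed model package under a short name, replacing any old link:
#     python -m spacy.link en_core_web_sm en --force
# Or link a local model directory:
#     python -m spacy.link /tmp/en_core_web_sm-1.2.0 en_sm

import spacy
nlp = spacy.load('en')   # now resolves through the symlink in spacy/data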
spacy/tests/test_download.py (new file, 36 lines)

@@ -0,0 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals

from ..download import download, get_compatibility, get_version, check_error_depr
import pytest


def test_download_fetch_compatibility():
    compatibility = get_compatibility()
    assert type(compatibility) == dict


@pytest.mark.slow
@pytest.mark.parametrize('model', ['en_core_web_md-1.2.0'])
def test_download_direct_download(model):
    download(model, direct=True)


@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_succeeds(model):
    comp = { model: ['1.7.0', '0.100.0'] }
    assert get_version(model, comp)


@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_fails(model):
    diff_model = 'test_' + model
    comp = { diff_model: ['1.7.0', '0.100.0'] }
    with pytest.raises(SystemExit):
        assert get_version(model, comp)


@pytest.mark.parametrize('model', [False, None, '', 'all'])
def test_download_no_model_depr_error(model):
    with pytest.raises(SystemExit):
        check_error_depr(model)
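The new test module above talks to the live compatibility endpoint, and the direct-download case (marked slow) actually pip-installs a model. A note on running it and on what get_version() asserts; the pytest invocation is an assumption about the local setup:

# Fast checks only, skipping the slow direct-download test:
#     python -m pytest spacy/tests/test_download.py -m "not slow"

from spacy.download import get_version
# get_version() returns the first version listed for the model:
assert get_version('en_core_web_md', {'en_core_web_md': ['1.7.0', '0.100.0']}) == '1.7.0'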
@@ -1,40 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals

import io
import pytest
import dill as pickle

from ..strings import StringStore
from ..vocab import Vocab
from ..attrs import NORM


def test_pickle_string_store():
    sstore = StringStore()
    hello = sstore['hello']
    bye = sstore['bye']
    bdata = pickle.dumps(sstore, protocol=-1)
    unpickled = pickle.loads(bdata)
    assert unpickled['hello'] == hello
    assert unpickled['bye'] == bye
    assert len(sstore) == len(unpickled)
@pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
def test_pickle_string_store(stringstore, text1, text2):
    store1 = stringstore[text1]
    store2 = stringstore[text2]
    data = pickle.dumps(stringstore, protocol=-1)
    unpickled = pickle.loads(data)
    assert unpickled[text1] == store1
    assert unpickled[text2] == store2
    assert len(stringstore) == len(unpickled)


@pytest.mark.xfail
def test_pickle_vocab():
@pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
def test_pickle_vocab(text1, text2):
    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    dog = vocab[u'dog']
    cat = vocab[u'cat']
    assert dog.norm_ == 'do'
    assert cat.norm_ == 'ca'

    bdata = pickle.dumps(vocab)
    unpickled = pickle.loads(bdata)

    assert unpickled[u'dog'].orth == dog.orth
    assert unpickled[u'cat'].orth == cat.orth
    assert unpickled[u'dog'].norm == dog.norm
    assert unpickled[u'cat'].norm == cat.norm
    dog_ = unpickled[u'dog']
    cat_ = unpickled[u'cat']
    assert dog_.norm != cat_.norm
    lex1 = vocab[text1]
    lex2 = vocab[text2]
    assert lex1.norm_ == text1[:-1]
    assert lex2.norm_ == text2[:-1]
    data = pickle.dumps(vocab)
    unpickled = pickle.loads(data)
    assert unpickled[text1].orth == lex1.orth
    assert unpickled[text2].orth == lex2.orth
    assert unpickled[text1].norm == lex1.norm
    assert unpickled[text2].norm == lex2.norm
    assert unpickled[text1].norm != unpickled[text2].norm
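The rewritten pickle tests above now take a stringstore fixture instead of constructing a StringStore inline; the fixture itself is not part of this diff. A minimal version of what it presumably looks like in the tests' shared conftest:

import pytest
from spacy.strings import StringStore

# Hypothetical fixture; the real one would live in spacy/tests/conftest.py.
@pytest.fixture
def stringstore():
    return StringStore()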
@@ -1,13 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
import os
import io
import json
import re
import os.path
import pathlib
import sys

import six
import textwrap

from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

try:

@@ -144,3 +147,53 @@ def check_renamed_kwargs(renamed, kwargs):
    for old, new in renamed.items():
        if old in kwargs:
            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


def parse_package_meta(package_path, package):
    location = os.path.join(str(package_path), package, 'meta.json')
    if not os.path.isfile(location):
        print_msg("'{p}' doesn't seem to be a valid model package.".format(p=package),
                  title="No meta.json found")
    else:
        with io.open(location, encoding='utf8') as f:
            meta = json.load(f)
        return meta
    return False


def print_msg(*text, **kwargs):
    """Print formatted message. Each positional argument is rendered as newline-
    separated paragraph. If kwarg 'title' exist, title is printed above the text
    and highlighted (using ANSI escape sequences manually to avoid unnecessary
    dependency)."""

    message = '\n\n'.join([_wrap_text(t) for t in text])
    tpl_msg = '\n{msg}\n'
    tpl_title = '\n\033[93m{msg}\033[0m'

    if 'title' in kwargs and kwargs['title']:
        title = _wrap_text(kwargs['title'])
        print(tpl_title.format(msg=title))
    print(tpl_msg.format(msg=message))


def _wrap_text(text):
    """Wrap text at given width using textwrap module. Indent should consist of
    spaces. Its length is deducted from wrap width to ensure exact wrapping."""

    wrap_max = 80
    indent = '    '
    wrap_width = wrap_max - len(indent)
    return textwrap.fill(text, width=wrap_width, initial_indent=indent,
                         subsequent_indent=indent, break_long_words=False,
                         break_on_hyphens=False)


def sys_exit(*messages, **kwargs):
    """Performs SystemExit. For modules used from the command line, like
    download and link. To print message, use the same arguments as for
    print_msg()."""

    if messages:
        print_msg(*messages, **kwargs)
    sys.exit(0)
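The second hunk above adds the messaging helpers in spacy/util.py that the new download and link commands rely on. A small sketch of how they compose; the message strings are illustrative:

from spacy import util

# Prints a highlighted title (ANSI bright yellow) followed by wrapped paragraphs.
util.print_msg(
    "Model successfully installed.",
    "You can now load it via spacy.load('en').",
    title="Download complete")

# Same formatting, then exits with status 0; used by the CLI-facing modules.
util.sys_exit(
    "python -m spacy.download [name or shortcut]",
    title="Missing model name or shortcut")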
@@ -57,20 +57,6 @@ p Many of the associated tools and resources that we're developing alongside spa
    +cell
        | Super sparse multi-class machine learning with Cython.

    +row
        +cell
            +src(gh("sputnik")) Sputnik

        +cell
            | Data package manager library for spaCy.

    +row
        +cell
            +src(gh("sputnik-server")) Sputnik Server

        +cell
            | Index service for the Sputnik data package manager for spaCy.

    +row
        +cell
            +src(gh("cymem")) Cymem