Merge pull request #222 from henningpeters/revise_packaging

integrate with sputnik
This commit is contained in:
Matthew Honnibal 2016-01-16 01:23:39 +11:00
commit ed001ea977
12 changed files with 59 additions and 103 deletions

View File

@ -1,17 +1,14 @@
{ {
"name": "en_default", "name": "en_test",
"version": "0.100.0", "version": "1.0.0",
"description": "english default model", "description": "english test model",
"license": "public domain", "license": "public domain",
"include": [ "include": [
"deps/*", ["deps", "*"],
"ner/*", ["ner", "*"],
"pos/*", ["pos", "*"],
"tokenizer/*", ["tokenizer", "*"],
"vocab/*", ["vocab", "*"],
"wordnet/*" ["wordnet", "*"]
], ]
"compatibility": {
"spacy": "==0.100.0"
}
} }

View File

@ -10,4 +10,4 @@ plac
six six
ujson ujson
cloudpickle cloudpickle
sputnik>=0.6.4,<0.7.0 sputnik>=0.7.0,<0.8.0

View File

@ -271,7 +271,7 @@ def setup_package():
ext_modules=ext_modules, ext_modules=ext_modules,
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47', install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six', 'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'], 'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'],
cmdclass = { cmdclass = {
'build_ext': build_ext_subclass}, 'build_ext': build_ext_subclass},
) )

View File

@ -3,7 +3,9 @@ import os
import shutil import shutil
import plac import plac
from sputnik import Sputnik import sputnik
from .. import about
def migrate(path): def migrate(path):
@ -35,23 +37,17 @@ def link(package, path):
force=("Force overwrite", "flag", "f", bool), force=("Force overwrite", "flag", "f", bool),
) )
def main(data_size='all', force=False): def main(data_size='all', force=False):
# TODO read version from the same source as the setup
sputnik = Sputnik('spacy', '0.100.0', console=sys.stdout)
path = os.path.dirname(os.path.abspath(__file__)) path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.abspath(os.path.join(path, '..', 'data')) data_path = os.path.abspath(os.path.join(path, '..', 'data'))
if not os.path.isdir(data_path): if not os.path.isdir(data_path):
os.mkdir(data_path) os.mkdir(data_path)
command = sputnik.command(
data_path=data_path,
repository_url='https://index.spacy.io')
if force: if force:
command.purge() sputnik.purge('spacy', about.short_version, data_path=data_path)
package = command.install('en_default') package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4',
data_path=data_path)
# FIXME clean up old-style packages # FIXME clean up old-style packages
migrate(path) migrate(path)

View File

@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package, Package from .util import get_package
class Language(object): class Language(object):
@ -146,13 +146,13 @@ class Language(object):
@classmethod @classmethod
def default_parser(cls, package, vocab): def default_parser(cls, package, vocab):
data_dir = package.dir_path('deps', require=False) data_dir = package.dir_path('deps')
if data_dir and path.exists(data_dir): if data_dir and path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, ArcEager) return Parser.from_dir(data_dir, vocab.strings, ArcEager)
@classmethod @classmethod
def default_entity(cls, package, vocab): def default_entity(cls, package, vocab):
data_dir = package.dir_path('ner', require=False) data_dir = package.dir_path('ner')
if data_dir and path.exists(data_dir): if data_dir and path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
@ -182,7 +182,7 @@ class Language(object):
- Language(model='en_default ==1.0.0') - Language(model='en_default ==1.0.0')
- Language(model='en_default <1.1.0', data_dir='spacy/data') - Language(model='en_default <1.1.0', data_dir='spacy/data')
""" """
package = Package(data_dir) package = get_package(model, data_path=data_dir)
if load_vectors is not True: if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning) warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True): if vocab in (None, True):

View File

@ -8,13 +8,13 @@ except ImportError:
import json import json
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
from .util import Package from .util import get_package
class Lemmatizer(object): class Lemmatizer(object):
@classmethod @classmethod
def load(cls, pkg_or_str_or_file): def load(cls, pkg_or_str_or_file):
pkg = Package.create_or_return(pkg_or_str_or_file) pkg = get_package(pkg_or_str_or_file)
index = {} index = {}
exc = {} exc = {}
for pos in ['adj', 'noun', 'verb']: for pos in ['adj', 'noun', 'verb']:

View File

@ -21,7 +21,7 @@ from .tokens.doc cimport Doc
from .vocab cimport Vocab from .vocab cimport Vocab
from .attrs import FLAG61 as U_ENT from .attrs import FLAG61 as U_ENT
from .util import Package from .util import get_package
from .attrs import FLAG60 as B2_ENT from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT from .attrs import FLAG59 as B3_ENT
@ -171,7 +171,7 @@ cdef class Matcher:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, pkg_or_str_or_file, Vocab vocab):
package = Package.create_or_return(pkg_or_str_or_file) package = get_package(pkg_or_str_or_file)
patterns = package.load_json(('vocab', 'gazetteer.json')) patterns = package.load_json(('vocab', 'gazetteer.json'))
return cls(vocab, patterns) return cls(vocab, patterns)

View File

@ -16,7 +16,7 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .attrs cimport * from .attrs cimport *
from .util import Package from .util import get_package
cpdef enum: cpdef enum:
@ -149,7 +149,7 @@ cdef class Tagger:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, vocab): def load(cls, pkg_or_str_or_file, vocab):
pkg = Package.create_or_return(pkg_or_str_or_file) pkg = get_package(pkg_or_str_or_file)
# TODO: templates.json deprecated? not present in latest package # TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates() templates = cls.default_templates()
# templates = package.load_utf8(json.load, # templates = package.load_utf8(json.load,

View File

@ -5,7 +5,7 @@ import io
import pickle import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import get_package, Package from spacy.util import get_package
import pytest import pytest

View File

@ -17,7 +17,7 @@ cimport cython
from . import util from . import util
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .util import read_lang_data from .util import read_lang_data
from .util import Package from .util import get_package
cdef class Tokenizer: cdef class Tokenizer:
@ -43,7 +43,7 @@ cdef class Tokenizer:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, pkg_or_str_or_file, Vocab vocab):
pkg = Package.create_or_return(pkg_or_str_or_file) pkg = get_package(pkg_or_str_or_file)
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
prefix_re = re.compile(prefix_re) prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re) suffix_re = re.compile(suffix_re)

View File

@ -3,76 +3,39 @@ import io
import json import json
import re import re
import os.path import os.path
from contextlib import contextmanager
import types
import sputnik
from sputnik.dir_package import DirPackage
from sputnik.package_stub import PackageStub
from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException
from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def local_path(*dirs): def get_package(value=None, data_path=None):
return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))
class Package(object):
@classmethod
def create_or_return(cls, me_or_arg):
return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
def __init__(self, data_path=None, model='en_default-1.0.3'):
if data_path is None: if data_path is None:
data_path = local_path('data', model) if isinstance(value, PackageStub):
self.model = model return value
self.data_path = data_path elif value and os.path.isdir(value):
self._root = self.data_path return DirPackage(value)
def get(self, key): elif value is None and data_path is not None:
pass return DirPackage(data_path)
def has_file(self, *path_parts): try:
return os.path.exists(os.path.join(self._root, *path_parts)) return sputnik.package('spacy', about.short_version,
value or 'en_default==1.0.4',
data_path=data_path)
def file_path(self, *path_parts, **kwargs): except PackageNotFoundException as e:
return os.path.join(self._root, *path_parts) raise RuntimeError("Model not installed. Please run 'python -m "
"spacy.en.download' to install latest compatible "
def dir_path(self, *path_parts, **kwargs): "model.")
return os.path.join(self._root, *path_parts) except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy "
def load_json(self, path_parts, default=None): "version. Please run 'python -m spacy.en.download "
if not self.has_file(*path_parts): "--force' to install latest compatible model.")
if _is_error_class(default):
raise default(self.file_path(*path_parts))
elif isinstance(default, Exception):
raise default
else:
return default
with io.open(self.file_path(os.path.join(*path_parts)),
mode='r', encoding='utf8') as file_:
return json.load(file_)
@contextmanager
def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
if not self.has_file(*path_parts):
if _is_error_class(default):
raise default(self.file_path(*path_parts))
elif isinstance(default, Exception):
raise default
else:
yield default
else:
# Enter
file_ = io.open(self.file_path(os.path.join(*path_parts)),
mode=mode, encoding='utf8')
yield file_
# Exit
file_.close()
def _is_error_class(e):
return isinstance(e, types.TypeType) and issubclass(e, Exception)
def get_package(name=None, data_path=None):
return Package(data_path)
def normalize_slice(length, start, stop, step=None): def normalize_slice(length, start, stop, step=None):

View File

@ -19,7 +19,7 @@ from .orth cimport word_shape
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile from .cfile cimport CFile
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .util import Package from .util import get_package
from . import attrs from . import attrs
from . import symbols from . import symbols
@ -49,7 +49,7 @@ cdef class Vocab:
''' '''
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None): def load(cls, pkg_or_str_or_file, get_lex_attr=None):
package = Package.create_or_return(pkg_or_str_or_file) package = get_package(pkg_or_str_or_file)
tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.load(package) lemmatizer = Lemmatizer.load(package)