Merge pull request #223 from henningpeters/revise_packaging

Refactored data_dir -> via, added zip_safe, added spacy.load()
This commit is contained in:
Matthew Honnibal 2016-01-17 02:12:48 +11:00
commit 65c5b03b9b
18 changed files with 150 additions and 178 deletions

View File

@ -61,7 +61,7 @@ build_script:
- "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "%CMD_IN_ENV% sputnik build data en_default.sputnik"
- "%CMD_IN_ENV% sputnik install en_default.sputnik"
- "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"
test_script:
# Run the project tests

View File

@ -31,7 +31,7 @@ install:
- "python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "sputnik build data en_default.sputnik"
- "sputnik install en_default.sputnik"
- "sputnik --name spacy install en_default.sputnik"
script:
- python build.py $MODE;

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python
from __future__ import division, print_function
from __future__ import print_function
import os
import shutil
import subprocess
@ -14,13 +14,6 @@ except ImportError:
from distutils.core import Extension, setup
MAJOR = 0
MINOR = 100
MICRO = 0
ISRELEASED = False
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
PACKAGES = [
'spacy',
'spacy.tokens',
@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
build_ext.build_extensions(self)
# Return the git revision as a string
def git_version():
    """Return the full sha of git HEAD, or 'Unknown' if git is unavailable."""
    def _run_git(cmd):
        # Give the child process a minimal environment: keep only the
        # variables needed to locate the git binary.
        child_env = dict((key, os.environ[key])
                         for key in ('SYSTEMROOT', 'PATH')
                         if key in os.environ)
        # Force the C locale so git's output is not localized
        # (LANGUAGE is the variable consulted on win32).
        child_env['LANGUAGE'] = 'C'
        child_env['LANG'] = 'C'
        child_env['LC_ALL'] = 'C'
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=child_env)
        return proc.communicate()[0]
    try:
        revision = _run_git(['git', 'rev-parse', 'HEAD']).strip().decode('ascii')
    except OSError:
        # git binary missing or not runnable.
        revision = 'Unknown'
    return revision
def get_version_info():
    """Return (full_version, git_revision) for this checkout.

    The git revision must be resolved here (not at module import time):
    importing spacy.about during the build breaks under Python 3.
    """
    if os.path.exists('.git'):
        # Working from a git checkout: ask git directly.
        git_revision = git_version()
    elif os.path.exists(os.path.join('spacy', 'about.py')):
        # Source distribution: reuse the version file generated at sdist time.
        try:
            from spacy.about import git_revision
        except ImportError:
            raise ImportError('Unable to import git_revision. Try removing '
                              'spacy/about.py and the build directory '
                              'before building.')
    else:
        git_revision = 'Unknown'
    full_version = VERSION
    if not ISRELEASED:
        # Development builds carry a local-version suffix with the short sha.
        full_version += '.dev0+' + git_revision[:7]
    return full_version, git_revision
def write_version(path):
    """Generate the version file (spacy/about.py) at *path*."""
    template = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
short_version = '%(version)s'
version = '%(version)s'
full_version = '%(full_version)s'
git_revision = '%(git_revision)s'
release = %(isrelease)s
if not release:
    version = full_version
"""
    full_version, git_revision = get_version_info()
    fields = {
        'version': VERSION,
        'full_version': full_version,
        'git_revision': git_revision,
        'isrelease': str(ISRELEASED),
    }
    with open(path, 'w') as file_:
        file_.write(template % fields)
def generate_cython(root, source):
print('Cythonizing sources')
p = subprocess.call([sys.executable,
@ -241,7 +167,9 @@ def setup_package():
return clean(root)
with chdir(root):
write_version(os.path.join(root, 'spacy', 'about.py'))
about = {}
with open(os.path.join(root, "spacy", "about.py")) as f:
exec(f.read(), about)
include_dirs = [
get_python_inc(plat_specific=True),
@ -259,15 +187,16 @@ def setup_package():
prepare_includes(root)
setup(
name='spacy',
name=about['__name__'],
zip_safe=False,
packages=PACKAGES,
package_data={'': ['*.pyx', '*.pxd']},
description='Industrial-strength NLP',
author='Matthew Honnibal',
author_email='matt@spacy.io',
version=VERSION,
url='https://spacy.io',
license='MIT',
description=about['__summary__'],
author=about['__author__'],
author_email=about['__email__'],
version=about['__version__'],
url=about['__uri__'],
license=about['__license__'],
ext_modules=ext_modules,
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',

View File

@ -0,0 +1,7 @@
from . import util
from .en import English


def load(name, via=None):
    """Resolve the model package *name* and return an English pipeline.

    via: optional alternative package root directory to search instead of
    the default data path.
    """
    return English(package=util.get_package_by_name(name, via=via))

14
spacy/about.py Normal file
View File

@ -0,0 +1,14 @@
# Package metadata, kept in one place so setup.py and the runtime agree.
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# NOTE(review): assigning __name__ at module scope rebinds this module's own
# __name__ attribute; here it doubles as the distribution name read by
# setup.py and passed to sputnik.
__name__ = 'spacy'
__version__ = '0.100.0'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@spacy.io'
__license__ = 'MIT'
# presumably gates the dev-version suffix during packaging — verify in setup.py
__release__ = False
# Default model spec installed by `python -m spacy.en.download`.
__default_model__ = 'en_default==1.0.4'

View File

@ -1,9 +1,13 @@
from __future__ import print_function
import sys
import os
import shutil
import plac
import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from .. import about
@ -20,37 +24,34 @@ def migrate(path):
os.unlink(os.path.join(path, filename))
def link(package, path):
    """Expose the package's 'data' directory at *path*.

    Creates a symlink where supported; falls back to a full copy on
    platforms without os.symlink (Windows + Python 2.7).
    """
    # Remove whatever currently occupies the target path. A symlink must be
    # unlinked explicitly: os.path.isdir() follows links, and shutil.rmtree
    # refuses to operate on a symlink, so the original isdir/rmtree order
    # crashed when re-linking over an existing symlink. Checking islink()
    # first also removes broken symlinks, which os.path.exists() misses.
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    elif os.path.exists(path):
        os.unlink(path)
    if not hasattr(os, 'symlink'):  # not supported by win+py27
        shutil.copytree(package.dir_path('data'), path)
    else:
        os.symlink(package.dir_path('data'), path)
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
)
def main(data_size='all', force=False):
path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.abspath(os.path.join(path, '..', 'data'))
if not os.path.isdir(data_path):
os.mkdir(data_path)
if force:
sputnik.purge('spacy', about.short_version, data_path=data_path)
sputnik.purge(about.__name__, about.__version__)
package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4',
data_path=data_path)
try:
sputnik.package(about.__name__, about.__version__, about.__default_model__)
print("Model already installed. Please run 'python -m "
"spacy.en.download --force' to reinstall.", file=sys.stderr)
sys.exit(1)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
try:
sputnik.package(about.__name__, about.__version__, about.__default_model__)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
print("Model failed to install. Please run 'python -m "
"spacy.en.download --force'.", file=sys.stderr)
sys.exit(1)
# FIXME clean up old-style packages
migrate(path)
migrate(os.path.dirname(os.path.abspath(__file__)))
print("Model successfully installed.", file=sys.stderr)
if __name__ == '__main__':

View File

@ -19,8 +19,8 @@ from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from . import util
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package
class Language(object):
@ -137,12 +137,10 @@ class Language(object):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_vocab(cls, package=None, get_lex_attr=None):
if package is None:
package = get_package()
def default_vocab(cls, package, get_lex_attr=None):
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs()
return Vocab.load(package, get_lex_attr=get_lex_attr)
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
@classmethod
def default_parser(cls, package, vocab):
@ -158,7 +156,6 @@ class Language(object):
def __init__(self,
data_dir=None,
model=None,
vocab=None,
tokenizer=None,
tagger=None,
@ -166,33 +163,44 @@ class Language(object):
entity=None,
matcher=None,
serializer=None,
load_vectors=True):
load_vectors=True,
package=None):
"""
a model can be specified:
1) by a path to the model directory (DEPRECATED)
- Language(data_dir='path/to/data')
1) by calling a Language subclass
- spacy.en.English()
2) by a language identifier (and optionally a package root dir)
- Language(lang='en')
- Language(lang='en', data_dir='spacy/data')
2) by calling a Language subclass with data_dir
- spacy.en.English('my/model/root')
- spacy.en.English(data_dir='my/model/root')
3) by a model name/version (and optionally a package root dir)
- Language(model='en_default')
- Language(model='en_default ==1.0.0')
- Language(model='en_default <1.1.0, data_dir='spacy/data')
3) by package name
- spacy.load('en_default')
- spacy.load('en_default==1.0.0')
4) by package name with a relocated package base
- spacy.load('en_default', via='/my/package/root')
- spacy.load('en_default==1.0.0', via='/my/package/root')
"""
package = get_package(model, data_path=data_dir)
if package is None:
if data_dir is None:
package = util.get_package_by_name()
else:
package = util.get_package(data_dir)
if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True):
vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs())
vocab = self.default_vocab(package)
self.vocab = vocab
if tokenizer in (None, True):
tokenizer = Tokenizer.load(package, self.vocab)
tokenizer = Tokenizer.from_package(package, self.vocab)
self.tokenizer = tokenizer
if tagger in (None, True):
tagger = Tagger.load(package, self.vocab)
tagger = Tagger.from_package(package, self.vocab)
self.tagger = tagger
if entity in (None, True):
entity = self.default_entity(package, self.vocab)
@ -201,13 +209,12 @@ class Language(object):
parser = self.default_parser(package, self.vocab)
self.parser = parser
if matcher in (None, True):
matcher = Matcher.load(package, self.vocab)
matcher = Matcher.from_package(package, self.vocab)
self.matcher = matcher
def __reduce__(self):
args = (
None, # data_dir
None, # model
self.vocab,
self.tokenizer,
self.tagger,

View File

@ -13,8 +13,11 @@ from .util import get_package
class Lemmatizer(object):
@classmethod
def load(cls, pkg_or_str_or_file):
pkg = get_package(pkg_or_str_or_file)
def load(cls, via):
return cls.from_package(get_package(via))
@classmethod
def from_package(cls, pkg):
index = {}
exc = {}
for pos in ['adj', 'noun', 'verb']:

View File

@ -170,8 +170,11 @@ cdef class Matcher:
cdef object _patterns
@classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab):
package = get_package(pkg_or_str_or_file)
def load(cls, data_dir, Vocab vocab):
    # Convenience wrapper: resolve data_dir to a package, then delegate.
    return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, package, Vocab vocab):
    # Build a Matcher from the gazetteer patterns shipped in the model
    # package (vocab/gazetteer.json).
    patterns = package.load_json(('vocab', 'gazetteer.json'))
    return cls(vocab, patterns)

View File

@ -148,8 +148,11 @@ cdef class Tagger:
return cls(vocab, model)
@classmethod
def load(cls, pkg_or_str_or_file, vocab):
pkg = get_package(pkg_or_str_or_file)
def load(cls, data_dir, vocab):
return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates()
# templates = package.load_utf8(json.load,

View File

@ -7,11 +7,11 @@ import os
@pytest.fixture(scope="session")
def EN():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
else:
data_path = None
print("Load EN from %s" % data_path)
return English(data_dir=data_path)
data_dir = None
print("Load EN from %s" % data_dir)
return English(data_dir=data_dir)
def pytest_addoption(parser):

View File

@ -13,6 +13,7 @@ from spacy.tokenizer import Tokenizer
from os import path
import os
from spacy import util
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer
@ -21,11 +22,13 @@ from spacy.serialize.bits import BitArray
@pytest.fixture
def vocab():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
if data_dir is None:
package = util.get_package_by_name()
else:
data_path = None
vocab = English.default_vocab(package=data_path)
package = util.get_package(data_dir)
vocab = English.default_vocab(package=package)
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
lex = vocab['the']

View File

@ -5,23 +5,23 @@ import io
import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import get_package
from spacy import util
import pytest
@pytest.fixture
def package():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
if data_dir is None:
return util.get_package_by_name()
else:
data_path = None
return get_package(data_path=data_path)
return util.get_package(data_dir)
@pytest.fixture
def lemmatizer(package):
return Lemmatizer.load(package)
return Lemmatizer.from_package(package)
def test_read_index(package):

View File

@ -7,10 +7,10 @@ import os
def nlp():
from spacy.en import English
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
else:
data_path = None
return English(data_dir=data_path)
data_dir = None
return English(data_dir=data_dir)
@pytest.fixture()

View File

@ -11,13 +11,13 @@ def token(doc):
def test_load_resources_and_process_text():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
else:
data_path = None
print("Load EN from %s" % data_path)
data_dir = None
print("Load EN from %s" % data_dir)
from spacy.en import English
nlp = English(data_dir=data_path)
nlp = English(data_dir=data_dir)
doc = nlp('Hello, world. Here are two sentences.')

View File

@ -42,9 +42,12 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
@classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab):
pkg = get_package(pkg_or_str_or_file)
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
def load(cls, data_dir, Vocab vocab):
return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, package, Vocab vocab):
rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re)

View File

@ -4,37 +4,33 @@ import json
import re
import os.path
import six
import sputnik
from sputnik.dir_package import DirPackage
from sputnik.package_stub import PackageStub
from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def get_package(value=None, data_path=None):
if data_path is None:
if isinstance(value, PackageStub):
return value
elif value and os.path.isdir(value):
return DirPackage(value)
def get_package(data_dir):
    """Wrap the model directory *data_dir* as a sputnik DirPackage."""
    if isinstance(data_dir, six.string_types):
        return DirPackage(data_dir)
    raise RuntimeError('data_dir must be a string')
elif value is None and data_path is not None:
return DirPackage(data_path)
def get_package_by_name(name=None, via=None):
try:
return sputnik.package('spacy', about.short_version,
value or 'en_default==1.0.4',
data_path=data_path)
return sputnik.package(about.__name__, about.__version__,
name or about.__default_model__, data_path=via)
except PackageNotFoundException as e:
raise RuntimeError("Model not installed. Please run 'python -m "
"spacy.en.download' to install latest compatible "
"model.")
except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy "
"version. Please run 'python -m spacy.en.download' "
"version. Please run 'python -m spacy.en.download "
"--force' to install latest compatible model.")

View File

@ -48,11 +48,14 @@ cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
@classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None):
package = get_package(pkg_or_str_or_file)
def load(cls, data_dir, get_lex_attr=None):
return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
@classmethod
def from_package(cls, package, get_lex_attr=None):
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.load(package)
lemmatizer = Lemmatizer.from_package(package)
serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})