* Fix merge conflict in requirements.txt

Matthew Honnibal 2016-01-16 16:20:49 +01:00
commit 3dc398b727
22 changed files with 220 additions and 222 deletions

View File

@@ -61,7 +61,7 @@ build_script:
   - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
   - "cp package.json data"
   - "%CMD_IN_ENV% sputnik build data en_default.sputnik"
-  - "%CMD_IN_ENV% sputnik install en_default.sputnik"
+  - "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"
 test_script:
   # Run the project tests

View File

@@ -31,7 +31,7 @@ install:
  - "python bin/init_model.py en lang_data/ corpora/ data"
  - "cp package.json data"
  - "sputnik build data en_default.sputnik"
- - "sputnik install en_default.sputnik"
+ - "sputnik --name spacy install en_default.sputnik"
 script:
  - python build.py $MODE;

View File

@@ -1,17 +1,14 @@
 {
-    "name": "en_default",
-    "version": "0.100.0",
-    "description": "english default model",
+    "name": "en_test",
+    "version": "1.0.0",
+    "description": "english test model",
     "license": "public domain",
     "include": [
-        "deps/*",
-        "ner/*",
-        "pos/*",
-        "tokenizer/*",
-        "vocab/*",
-        "wordnet/*"
-    ],
-    "compatibility": {
-        "spacy": "==0.100.0"
-    }
+        ["deps", "*"],
+        ["ner", "*"],
+        ["pos", "*"],
+        ["tokenizer", "*"],
+        ["vocab", "*"],
+        ["wordnet", "*"]
+    ]
 }

View File

@@ -10,4 +10,4 @@ plac
 six
 ujson
 cloudpickle
-sputnik==0.7.*
+sputnik>=0.7.0,<0.8.0

View File

@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import division, print_function
+from __future__ import print_function
 import os
 import shutil
 import subprocess
@@ -14,13 +14,6 @@ except ImportError:
     from distutils.core import Extension, setup
 
-MAJOR = 0
-MINOR = 100
-MICRO = 0
-ISRELEASED = False
-VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
-
-
 PACKAGES = [
     'spacy',
     'spacy.tokens',
@@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
         build_ext.build_extensions(self)
 
-
-# Return the git revision as a string
-def git_version():
-    def _minimal_ext_cmd(cmd):
-        # construct minimal environment
-        env = {}
-        for k in ['SYSTEMROOT', 'PATH']:
-            v = os.environ.get(k)
-            if v is not None:
-                env[k] = v
-        # LANGUAGE is used on win32
-        env['LANGUAGE'] = 'C'
-        env['LANG'] = 'C'
-        env['LC_ALL'] = 'C'
-        out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0]
-        return out
-
-    try:
-        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
-        GIT_REVISION = out.strip().decode('ascii')
-    except OSError:
-        GIT_REVISION = 'Unknown'
-
-    return GIT_REVISION
-
-
-def get_version_info():
-    # Adding the git rev number needs to be done inside write_version_py(),
-    # otherwise the import of spacy.about messes up the build under Python 3.
-    FULLVERSION = VERSION
-    if os.path.exists('.git'):
-        GIT_REVISION = git_version()
-    elif os.path.exists(os.path.join('spacy', 'about.py')):
-        # must be a source distribution, use existing version file
-        try:
-            from spacy.about import git_revision as GIT_REVISION
-        except ImportError:
-            raise ImportError('Unable to import git_revision. Try removing '
-                              'spacy/about.py and the build directory '
-                              'before building.')
-    else:
-        GIT_REVISION = 'Unknown'
-
-    if not ISRELEASED:
-        FULLVERSION += '.dev0+' + GIT_REVISION[:7]
-
-    return FULLVERSION, GIT_REVISION
-
-
-def write_version(path):
-    cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
-short_version = '%(version)s'
-version = '%(version)s'
-full_version = '%(full_version)s'
-git_revision = '%(git_revision)s'
-release = %(isrelease)s
-if not release:
-    version = full_version
-"""
-    FULLVERSION, GIT_REVISION = get_version_info()
-    with open(path, 'w') as f:
-        f.write(cnt % {'version': VERSION,
-                       'full_version': FULLVERSION,
-                       'git_revision': GIT_REVISION,
-                       'isrelease': str(ISRELEASED)})
-
-
 def generate_cython(root, source):
     print('Cythonizing sources')
     p = subprocess.call([sys.executable,
@@ -241,7 +167,9 @@ def setup_package():
         return clean(root)
 
     with chdir(root):
-        write_version(os.path.join(root, 'spacy', 'about.py'))
+        about = {}
+        with open(os.path.join(root, "spacy", "about.py")) as f:
+            exec(f.read(), about)
 
         include_dirs = [
             get_python_inc(plat_specific=True),
@@ -259,19 +187,20 @@ def setup_package():
         prepare_includes(root)
 
         setup(
-            name='spacy',
+            name=about['__name__'],
+            zip_safe=False,
             packages=PACKAGES,
             package_data={'': ['*.pyx', '*.pxd']},
-            description='Industrial-strength NLP',
-            author='Matthew Honnibal',
-            author_email='matt@spacy.io',
-            version=VERSION,
-            url='https://spacy.io',
-            license='MIT',
+            description=about['__summary__'],
+            author=about['__author__'],
+            author_email=about['__email__'],
+            version=about['__version__'],
+            url=about['__uri__'],
+            license=about['__license__'],
             ext_modules=ext_modules,
             install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
                               'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
-                              'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'],
+                              'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'],
             cmdclass = {
                 'build_ext': build_ext_subclass},
         )
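Note on the setup.py change above: the generated-version machinery (git_version, get_version_info, write_version) is replaced by a single-source metadata file, spacy/about.py, which setup.py reads by exec-ing it into a plain dict so the package never has to be imported at build time. A minimal sketch of that pattern, using the same file the diff reads:

    import os

    about = {}
    with open(os.path.join('spacy', 'about.py')) as f:
        exec(f.read(), about)  # fills about['__name__'], about['__version__'], ...

    print(about['__version__'])  # '0.100.0' for this commit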

View File

@@ -0,0 +1,7 @@
+from . import util
+from .en import English
+
+
+def load(name, via=None):
+    package = util.get_package_by_name(name, via=via)
+    return English(package=package)
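The new top-level loader above is the entry point the rest of this diff builds towards. A usage sketch, assuming a model has already been installed with python -m spacy.en.download; the via argument points at a relocated package base, as described in the Language docstring further down:

    import spacy

    nlp = spacy.load('en_default')  # installed model, by package name
    nlp = spacy.load('en_default==1.0.4', via='/my/package/root')  # pinned version, custom base
    doc = nlp(u'Hello, world. Here are two sentences.')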

spacy/about.py (new file, 14 lines added)
View File

@@ -0,0 +1,14 @@
+# inspired from:
+
+# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
+# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
+
+__name__ = 'spacy'
+__version__ = '0.100.0'
+__summary__ = 'Industrial-strength NLP'
+__uri__ = 'https://spacy.io'
+__author__ = 'Matthew Honnibal'
+__email__ = 'matt@spacy.io'
+__license__ = 'MIT'
+__release__ = False
+__default_model__ = 'en_default==1.0.4'

View File

@@ -1,9 +1,15 @@
+from __future__ import print_function
+
 import sys
 import os
 import shutil
 
 import plac
-from sputnik import Sputnik
+import sputnik
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)
+
+from .. import about
 
 
 def migrate(path):
@@ -18,43 +24,34 @@ def migrate(path):
             os.unlink(os.path.join(path, filename))
 
 
-def link(package, path):
-    if os.path.exists(path):
-        if os.path.isdir(path):
-            shutil.rmtree(path)
-        else:
-            os.unlink(path)
-
-    if not hasattr(os, 'symlink'):  # not supported by win+py27
-        shutil.copytree(package.dir_path('data'), path)
-    else:
-        os.symlink(package.dir_path('data'), path)
-
-
 @plac.annotations(
     force=("Force overwrite", "flag", "f", bool),
 )
 def main(data_size='all', force=False):
-    # TODO read version from the same source as the setup
-    sputnik = Sputnik('spacy', '0.100.0', console=sys.stdout)
-
-    path = os.path.dirname(os.path.abspath(__file__))
-
-    data_path = os.path.abspath(os.path.join(path, '..', 'data'))
-    if not os.path.isdir(data_path):
-        os.mkdir(data_path)
-
-    command = sputnik.command(
-        data_path=data_path,
-        repository_url='https://index.spacy.io')
-
     if force:
-        command.purge()
+        sputnik.purge(about.__name__, about.__version__)
 
-    package = command.install('en_default')
+    try:
+        sputnik.package(about.__name__, about.__version__, about.__default_model__)
+        print("Model already installed. Please run 'python -m "
+              "spacy.en.download --force' to reinstall.", file=sys.stderr)
+        sys.exit(1)
+    except (PackageNotFoundException, CompatiblePackageNotFoundException):
+        pass
+
+    package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
+
+    try:
+        sputnik.package(about.__name__, about.__version__, about.__default_model__)
+    except (PackageNotFoundException, CompatiblePackageNotFoundException):
+        print("Model failed to install. Please run 'python -m "
+              "spacy.en.download --force'.", file=sys.stderr)
+        sys.exit(1)
 
     # FIXME clean up old-style packages
-    migrate(path)
+    migrate(os.path.dirname(os.path.abspath(__file__)))
+
+    print("Model successfully installed.", file=sys.stderr)
 
 
 if __name__ == '__main__':
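The downloader now drives sputnik's module-level API directly and reads the app name, version and default model from spacy.about. It is normally run from the command line (python -m spacy.en.download, with --force to purge and reinstall), but since main() is a plain plac-annotated function it can also be called from Python. A small sketch, assuming spaCy is installed:

    from spacy.en import download

    # Equivalent to `python -m spacy.en.download`; force=True maps to --force.
    download.main(force=False)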

View File

@@ -19,8 +19,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
+from . import util
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import get_package
 
 
 class Language(object):
@@ -137,28 +137,25 @@ class Language(object):
         return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
 
     @classmethod
-    def default_vocab(cls, package=None, get_lex_attr=None):
-        if package is None:
-            package = get_package()
+    def default_vocab(cls, package, get_lex_attr=None):
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
         return Vocab.from_package(package, get_lex_attr=get_lex_attr)
 
     @classmethod
     def default_parser(cls, package, vocab):
-        data_dir = package.dir_path('deps', require=False)
+        data_dir = package.dir_path('deps')
         if data_dir and path.exists(data_dir):
             return Parser.from_dir(data_dir, vocab.strings, ArcEager)
 
     @classmethod
     def default_entity(cls, package, vocab):
-        data_dir = package.dir_path('ner', require=False)
+        data_dir = package.dir_path('ner')
         if data_dir and path.exists(data_dir):
             return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
 
     def __init__(self,
                  data_dir=None,
-                 model=None,
                  vocab=None,
                  tokenizer=None,
                  tagger=None,
@@ -166,48 +163,36 @@ class Language(object):
                  entity=None,
                  matcher=None,
                  serializer=None,
-                 load_vectors=True):
+                 load_vectors=True,
+                 package=None):
         """
         a model can be specified:
 
-        1) by a path to the model directory (DEPRECATED)
-            - Language(data_dir='path/to/data')
-
-        2) by a language identifier (and optionally a package root dir)
-            - Language(lang='en')
-            - Language(lang='en', data_dir='spacy/data')
-
-        3) by a model name/version (and optionally a package root dir)
-            - Language(model='en_default')
-            - Language(model='en_default ==1.0.0')
-            - Language(model='en_default <1.1.0, data_dir='spacy/data')
+        1) by calling a Language subclass
+            - spacy.en.English()
+
+        2) by calling a Language subclass with data_dir
+            - spacy.en.English('my/model/root')
+            - spacy.en.English(data_dir='my/model/root')
+
+        3) by package name
+            - spacy.load('en_default')
+            - spacy.load('en_default==1.0.0')
+
+        4) by package name with a relocated package base
+            - spacy.load('en_default', via='/my/package/root')
+            - spacy.load('en_default==1.0.0', via='/my/package/root')
         """
-        # support non-package data dirs
-        if data_dir and path.exists(path.join(data_dir, 'vocab')):
-            class Package(object):
-                def __init__(self, root):
-                    self.root = root
-
-                def has_file(self, *path_parts):
-                    return path.exists(path.join(self.root, *path_parts))
-
-                def file_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def dir_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def load_utf8(self, func, *path_parts, **kwargs):
-                    with io.open(self.file_path(path.join(*path_parts)),
-                                 mode='r', encoding='utf8') as f:
-                        return func(f)
-
-            warn("using non-package data_dir", DeprecationWarning)
-            package = Package(data_dir)
-        else:
-            package = get_package(name=model, data_path=data_dir)
+        if package is None:
+            if data_dir is None:
+                package = util.get_package_by_name()
+            else:
+                package = util.get_package(data_dir)
+
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
             vocab = self.default_vocab(package)
         self.vocab = vocab
@@ -230,7 +215,6 @@ class Language(object):
     def __reduce__(self):
         args = (
             None, # data_dir
-            None, # model
             self.vocab,
             self.tokenizer,
             self.tagger,
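With the constructor change above, package resolution happens in one place: an explicit package wins, otherwise data_dir is wrapped with util.get_package(), otherwise util.get_package_by_name() finds the installed default model. The test fixtures later in this diff rely on exactly this via the SPACY_DATA environment variable; a sketch of the same idea outside the test suite:

    import os
    from spacy.en import English

    # Use a local model directory if SPACY_DATA is set, else the installed package.
    nlp = English(data_dir=os.environ.get('SPACY_DATA'))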

View File

@@ -8,25 +8,24 @@ except ImportError:
     import json
 
 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
-from .util import get_package
 
 
 class Lemmatizer(object):
     @classmethod
-    def from_package(cls, package):
+    def load(cls, via):
+        return cls.from_package(get_package(via))
+
+    @classmethod
+    def from_package(cls, pkg):
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
-            index[pos] = package.load_utf8(read_index,
-                'wordnet', 'index.%s' % pos,
-                default=set())  # TODO: really optional?
-            exc[pos] = package.load_utf8(read_exc,
-                'wordnet', '%s.exc' % pos,
-                default={})  # TODO: really optional?
-        rules = package.load_utf8(json.load,
-            'vocab', 'lemma_rules.json',
-            default={})  # TODO: really optional?
+            with pkg.open(('wordnet', 'index.%s' % pos), default=None) as file_:
+                index[pos] = read_index(file_) if file_ is not None else set()
+            with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
+                exc[pos] = read_exc(file_) if file_ is not None else {}
+        rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):

View File

@@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 
 from .attrs import FLAG61 as U_ENT
+from .util import get_package
 
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
@@ -168,11 +169,13 @@ cdef class Matcher:
     cdef readonly Vocab vocab
     cdef object _patterns
 
+    @classmethod
+    def load(cls, data_dir, Vocab vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
     @classmethod
     def from_package(cls, package, Vocab vocab):
-        patterns = package.load_utf8(json.load,
-            'vocab', 'gazetteer.json',
-            default={})  # TODO: really optional?
+        patterns = package.load_json(('vocab', 'gazetteer.json'))
         return cls(vocab, patterns)
 
     def __init__(self, vocab, patterns):

View File

@@ -89,6 +89,13 @@ cdef class Parser:
         model.load(path.join(model_dir, 'model'))
         return cls(strings, moves, model)
 
+    @classmethod
+    def load(cls, pkg_or_str_or_file, vocab):
+        # TODO
+        raise NotImplementedError(
+            "This should be here, but isn't yet =/. Use Parser.from_dir")
+
     def __reduce__(self):
         return (Parser, (self.moves.strings, self.moves, self.model), None, None)

View File

@@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 
 from .attrs cimport *
 
+from .util import get_package
+
 
 cpdef enum:
     P2_orth
@@ -146,7 +148,11 @@ cdef class Tagger:
         return cls(vocab, model)
 
     @classmethod
-    def from_package(cls, package, vocab):
+    def load(cls, data_dir, vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
+    @classmethod
+    def from_package(cls, pkg, vocab):
         # TODO: templates.json deprecated? not present in latest package
         templates = cls.default_templates()
         # templates = package.load_utf8(json.load,
@@ -156,8 +162,9 @@ cdef class Tagger:
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
-        if package.has_file('pos', 'model'):  # TODO: really optional?
-            model.load(package.file_path('pos', 'model'))
+
+        if pkg.has_file('pos', 'model'):  # TODO: really optional?
+            model.load(pkg.file_path('pos', 'model'))
         return cls(vocab, model)
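Matcher, Tagger, Tokenizer and (below) Vocab all gain the same thin convenience constructor in this commit: load(data_dir, ...) wraps the directory with get_package() and delegates to the package-based from_package(). A usage sketch, with a hypothetical local model directory:

    from spacy.tagger import Tagger
    from spacy.vocab import Vocab

    # '/path/to/model/data' is illustrative; both calls resolve the same resources.
    vocab = Vocab.load('/path/to/model/data')
    tagger = Tagger.load('/path/to/model/data', vocab)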

View File

@@ -1,11 +1,17 @@
 from spacy.en import English
 import pytest
+import os
 
 
 @pytest.fixture(scope="session")
 def EN():
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_dir = os.environ.get('SPACY_DATA')
+    else:
+        data_dir = None
+    print("Load EN from %s" % data_dir)
+    return English(data_dir=data_dir)
 
 
 def pytest_addoption(parser):

View File

@@ -11,7 +11,9 @@ from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
 from spacy.tokenizer import Tokenizer
 from os import path
+import os
 
+from spacy import util
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 from spacy.serialize.packer import Packer
@@ -20,7 +22,13 @@ from spacy.serialize.bits import BitArray
 
 @pytest.fixture
 def vocab():
-    vocab = English.default_vocab()
+    data_dir = os.environ.get('SPACY_DATA')
+    if data_dir is None:
+        package = util.get_package_by_name()
+    else:
+        package = util.get_package(data_dir)
+
+    vocab = English.default_vocab(package=package)
     lex = vocab['dog']
     assert vocab[vocab.strings['dog']].orth_ == 'dog'
     lex = vocab['the']

View File

@@ -1,17 +1,22 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import os
 import io
 import pickle
 
 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.util import get_package
+from spacy import util
 
 import pytest
 
 
 @pytest.fixture
 def package():
-    return get_package()
+    data_dir = os.environ.get('SPACY_DATA')
+    if data_dir is None:
+        return util.get_package_by_name()
+    else:
+        return util.get_package(data_dir)
 
 
 @pytest.fixture
@@ -20,14 +25,16 @@ def lemmatizer(package):
 
 def test_read_index(package):
-    index = package.load_utf8(read_index, 'wordnet', 'index.noun')
+    with package.open(('wordnet', 'index.noun')) as file_:
+        index = read_index(file_)
     assert 'man' in index
     assert 'plantes' not in index
    assert 'plant' in index
 
 
 def test_read_exc(package):
-    exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
+    with package.open(('wordnet', 'verb.exc')) as file_:
+        exc = read_exc(file_)
     assert exc['was'] == ('be',)

View File

@@ -50,6 +50,7 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 3
 
 
+@pytest.mark.xfail
 def test_therell(en_tokenizer):
     tokens = en_tokenizer("there'll")
     assert len(tokens) == 2

View File

@@ -6,7 +6,11 @@ import os
 
 @pytest.fixture(scope='session')
 def nlp():
     from spacy.en import English
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_dir = os.environ.get('SPACY_DATA')
+    else:
+        data_dir = None
+    return English(data_dir=data_dir)
 
 
 @pytest.fixture()

View File

@@ -10,8 +10,14 @@ def token(doc):
 
 def test_load_resources_and_process_text():
+    if os.environ.get('SPACY_DATA'):
+        data_dir = os.environ.get('SPACY_DATA')
+    else:
+        data_dir = None
+    print("Load EN from %s" % data_dir)
     from spacy.en import English
-    nlp = English()
+    nlp = English(data_dir=data_dir)
     doc = nlp('Hello, world. Here are two sentences.')

View File

@@ -15,8 +15,9 @@ from .strings cimport hash_string
 cimport cython
 
 from . import util
-from .util import read_lang_data
 from .tokens.doc cimport Doc
+from .util import read_lang_data
+from .util import get_package
 
 
 cdef class Tokenizer:
@@ -40,6 +41,10 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)
 
+    @classmethod
+    def load(cls, data_dir, Vocab vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
     @classmethod
     def from_package(cls, package, Vocab vocab):
         rules, prefix_re, suffix_re, infix_re = read_lang_data(package)

View File

@@ -2,23 +2,36 @@ import os
 import io
 import json
 import re
-import os.path
 
-from sputnik import Sputnik
+import six
+import sputnik
+from sputnik.dir_package import DirPackage
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)
 
+from . import about
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 
-def get_package(name=None, data_path=None):
-    if data_path is None:
-        if os.environ.get('SPACY_DATA'):
-            data_path = os.environ.get('SPACY_DATA')
-        else:
-            data_path = os.path.abspath(
-                os.path.join(os.path.dirname(__file__), 'data'))
-
-    sputnik = Sputnik('spacy', '0.100.0')  # TODO: retrieve version
-    pool = sputnik.pool(data_path)
-    return pool.get(name or 'en_default')
+def get_package(data_dir):
+    if not isinstance(data_dir, six.string_types):
+        raise RuntimeError('data_dir must be a string')
+    return DirPackage(data_dir)
+
+
+def get_package_by_name(name=None, via=None):
+    try:
+        return sputnik.package(about.__name__, about.__version__,
+                               name or about.__default_model__, data_path=via)
+    except PackageNotFoundException as e:
+        raise RuntimeError("Model not installed. Please run 'python -m "
+                           "spacy.en.download' to install latest compatible "
+                           "model.")
+    except CompatiblePackageNotFoundException as e:
+        raise RuntimeError("Installed model is not compatible with spaCy "
+                           "version. Please run 'python -m spacy.en.download "
+                           "--force' to install latest compatible model.")
 
 
 def normalize_slice(length, start, stop, step=None):
@@ -46,10 +59,13 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(package):
-    tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
-    prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
-    suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
-    infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
+    tokenization = package.load_json(('tokenizer', 'specials.json'))
+    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
+        prefix = read_prefix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
+        suffix = read_suffix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
+        infix = read_infix(file_) if file_ is not None else None
     return tokenization, prefix, suffix, infix
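The two helpers above split the old get_package() in two: get_package() now only wraps an explicit directory in a DirPackage, while get_package_by_name() resolves an installed sputnik package and turns lookup failures into actionable RuntimeErrors. A usage sketch:

    from spacy import util

    pkg = util.get_package('/path/to/unpacked/model')   # plain directory, no sputnik lookup
    pkg = util.get_package_by_name()                     # installed about.__default_model__
    pkg = util.get_package_by_name('en_default==1.0.4')  # or an explicit requirement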

View File

@@ -19,6 +19,7 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
+from .util import get_package
 
 from . import attrs
 from . import symbols
@@ -46,28 +47,28 @@ EMPTY_LEXEME.vector = EMPTY_VEC
 
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
+    @classmethod
+    def load(cls, data_dir, get_lex_attr=None):
+        return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
+
     @classmethod
     def from_package(cls, package, get_lex_attr=None):
-        tag_map = package.load_utf8(json.load,
-            'vocab', 'tag_map.json')
+        tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
 
         lemmatizer = Lemmatizer.from_package(package)
 
-        serializer_freqs = package.load_utf8(json.load,
-            'vocab', 'serializer.json',
-            require=False)  # TODO: really optional?
+        serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
 
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
 
-        if package.has_file('vocab', 'strings.json'):  # TODO: really optional?
-            package.load_utf8(self.strings.load, 'vocab', 'strings.json')
+        with package.open(('vocab', 'strings.json')) as file_:
+            self.strings.load(file_)
         self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
 
-        if package.has_file('vocab', 'vec.bin'):  # TODO: really optional?
+        if package.has_file('vocab', 'vec.bin'):
             self.vectors_length = self.load_vectors_from_bin_loc(
                 package.file_path('vocab', 'vec.bin'))
 
         return self
 
     def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
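For reference, the package objects threaded through Vocab, Tagger, Tokenizer, Matcher and Lemmatizer above only need a small surface: file_path()/has_file() for binary resources, open() as a context manager that can fall back to a default, and load_json() for optional JSON files. A rough, hypothetical sketch of that surface, for illustration only (the real implementations come from sputnik's DirPackage and installed packages):

    import io
    import json
    import os
    from contextlib import contextmanager

    _MISSING = object()

    class DirPackageSketch(object):
        """Illustrative stand-in for the package objects used in this commit."""

        def __init__(self, root):
            self.root = root

        def file_path(self, *parts):
            return os.path.join(self.root, *parts)

        def has_file(self, *parts):
            return os.path.exists(self.file_path(*parts))

        @contextmanager
        def open(self, parts, default=_MISSING):
            # Yield an open file handle, or the given default when the resource is absent.
            loc = self.file_path(*parts)
            if os.path.exists(loc):
                with io.open(loc, encoding='utf8') as file_:
                    yield file_
            elif default is _MISSING:
                raise IOError("missing package resource: %s" % loc)
            else:
                yield default

        def load_json(self, parts, default=_MISSING):
            # Parse a JSON resource, falling back to the default when it is absent.
            with self.open(parts, default=None) as file_:
                if file_ is not None:
                    return json.load(file_)
            if default is _MISSING:
                raise IOError("missing package resource: %s" % self.file_path(*parts))
            return default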