Merge pull request #223 from henningpeters/revise_packaging

Refactored data_dir -> via, added zip_safe, added spacy.load()
This commit is contained in:
Matthew Honnibal 2016-01-17 02:12:48 +11:00
commit 65c5b03b9b
18 changed files with 150 additions and 178 deletions

View File

@ -61,7 +61,7 @@ build_script:
- "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "%CMD_IN_ENV% sputnik build data en_default.sputnik"
- "%CMD_IN_ENV% sputnik install en_default.sputnik"
- "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"
test_script:
# Run the project tests

View File

@ -31,7 +31,7 @@ install:
- "python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "sputnik build data en_default.sputnik"
- "sputnik install en_default.sputnik"
- "sputnik --name spacy install en_default.sputnik"
script:
- python build.py $MODE;

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python
from __future__ import division, print_function
from __future__ import print_function
import os
import shutil
import subprocess
@ -14,13 +14,6 @@ except ImportError:
from distutils.core import Extension, setup
MAJOR = 0
MINOR = 100
MICRO = 0
ISRELEASED = False
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
PACKAGES = [
'spacy',
'spacy.tokens',
@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
build_ext.build_extensions(self)
# Return the git revision as a string
def git_version():
    """Return the full sha of git HEAD, or 'Unknown' if git is unavailable."""
    def _run_git(cmd):
        # Give the child process a minimal environment: keep only the
        # variables needed to locate the git binary.
        child_env = dict((key, os.environ[key])
                         for key in ('SYSTEMROOT', 'PATH')
                         if key in os.environ)
        # Force the C locale so git's output is not localized
        # (LANGUAGE is the variable consulted on win32).
        child_env['LANGUAGE'] = 'C'
        child_env['LANG'] = 'C'
        child_env['LC_ALL'] = 'C'
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=child_env)
        return proc.communicate()[0]
    try:
        revision = _run_git(['git', 'rev-parse', 'HEAD']).strip().decode('ascii')
    except OSError:
        # git binary missing or not runnable.
        revision = 'Unknown'
    return revision
def get_version_info():
    """Return (full_version, git_revision) for this checkout.

    The git revision must be resolved here (not at module import time):
    importing spacy.about during the build breaks under Python 3.
    """
    if os.path.exists('.git'):
        # Working from a git checkout: ask git directly.
        git_revision = git_version()
    elif os.path.exists(os.path.join('spacy', 'about.py')):
        # Source distribution: reuse the version file generated at sdist time.
        try:
            from spacy.about import git_revision
        except ImportError:
            raise ImportError('Unable to import git_revision. Try removing '
                              'spacy/about.py and the build directory '
                              'before building.')
    else:
        git_revision = 'Unknown'
    full_version = VERSION
    if not ISRELEASED:
        # Development builds carry a local-version suffix with the short sha.
        full_version += '.dev0+' + git_revision[:7]
    return full_version, git_revision
def write_version(path):
    """Generate the version file (spacy/about.py) at *path*."""
    template = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
short_version = '%(version)s'
version = '%(version)s'
full_version = '%(full_version)s'
git_revision = '%(git_revision)s'
release = %(isrelease)s
if not release:
    version = full_version
"""
    full_version, git_revision = get_version_info()
    fields = {
        'version': VERSION,
        'full_version': full_version,
        'git_revision': git_revision,
        'isrelease': str(ISRELEASED),
    }
    with open(path, 'w') as file_:
        file_.write(template % fields)
def generate_cython(root, source):
print('Cythonizing sources')
p = subprocess.call([sys.executable,
@ -241,7 +167,9 @@ def setup_package():
return clean(root)
with chdir(root):
write_version(os.path.join(root, 'spacy', 'about.py'))
about = {}
with open(os.path.join(root, "spacy", "about.py")) as f:
exec(f.read(), about)
include_dirs = [
get_python_inc(plat_specific=True),
@ -259,15 +187,16 @@ def setup_package():
prepare_includes(root)
setup(
name='spacy',
name=about['__name__'],
zip_safe=False,
packages=PACKAGES,
package_data={'': ['*.pyx', '*.pxd']},
description='Industrial-strength NLP',
author='Matthew Honnibal',
author_email='matt@spacy.io',
version=VERSION,
url='https://spacy.io',
license='MIT',
description=about['__summary__'],
author=about['__author__'],
author_email=about['__email__'],
version=about['__version__'],
url=about['__uri__'],
license=about['__license__'],
ext_modules=ext_modules,
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',

View File

@ -0,0 +1,7 @@
from . import util
from .en import English


def load(name, via=None):
    """Resolve the model package *name* and return an English pipeline.

    via: optional alternative package root directory to search instead of
    the default data path.
    """
    return English(package=util.get_package_by_name(name, via=via))

14
spacy/about.py Normal file
View File

@ -0,0 +1,14 @@
# Package metadata, kept in one place so setup.py and the runtime agree.
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# NOTE(review): assigning __name__ at module scope rebinds this module's own
# __name__ attribute; here it doubles as the distribution name read by
# setup.py and passed to sputnik.
__name__ = 'spacy'
__version__ = '0.100.0'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@spacy.io'
__license__ = 'MIT'
# presumably gates the dev-version suffix during packaging — verify in setup.py
__release__ = False
# Default model spec installed by `python -m spacy.en.download`.
__default_model__ = 'en_default==1.0.4'

View File

@ -1,9 +1,13 @@
from __future__ import print_function
import sys
import os
import shutil
import plac
import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from .. import about
@ -20,37 +24,34 @@ def migrate(path):
os.unlink(os.path.join(path, filename))
def link(package, path):
    """Expose the package's 'data' directory at *path*.

    Creates a symlink where supported; falls back to a full copy on
    platforms without os.symlink (Windows + Python 2.7).
    """
    # Remove whatever currently occupies the target path. A symlink must be
    # unlinked explicitly: os.path.isdir() follows links, and shutil.rmtree
    # refuses to operate on a symlink, so the original isdir/rmtree order
    # crashed when re-linking over an existing symlink. Checking islink()
    # first also removes broken symlinks, which os.path.exists() misses.
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    elif os.path.exists(path):
        os.unlink(path)
    if not hasattr(os, 'symlink'):  # not supported by win+py27
        shutil.copytree(package.dir_path('data'), path)
    else:
        os.symlink(package.dir_path('data'), path)
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
)
def main(data_size='all', force=False):
path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.abspath(os.path.join(path, '..', 'data'))
if not os.path.isdir(data_path):
os.mkdir(data_path)
if force:
sputnik.purge('spacy', about.short_version, data_path=data_path)
sputnik.purge(about.__name__, about.__version__)
package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4',
data_path=data_path)
try:
sputnik.package(about.__name__, about.__version__, about.__default_model__)
print("Model already installed. Please run 'python -m "
"spacy.en.download --force' to reinstall.", file=sys.stderr)
sys.exit(1)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
try:
sputnik.package(about.__name__, about.__version__, about.__default_model__)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
print("Model failed to install. Please run 'python -m "
"spacy.en.download --force'.", file=sys.stderr)
sys.exit(1)
# FIXME clean up old-style packages
migrate(path)
migrate(os.path.dirname(os.path.abspath(__file__)))
print("Model successfully installed.", file=sys.stderr)
if __name__ == '__main__':

View File

@ -19,8 +19,8 @@ from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from . import util
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package
class Language(object):
@ -137,12 +137,10 @@ class Language(object):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_vocab(cls, package=None, get_lex_attr=None):
if package is None:
package = get_package()
def default_vocab(cls, package, get_lex_attr=None):
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs()
return Vocab.load(package, get_lex_attr=get_lex_attr)
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
@classmethod
def default_parser(cls, package, vocab):
@ -158,7 +156,6 @@ class Language(object):
def __init__(self,
data_dir=None,
model=None,
vocab=None,
tokenizer=None,
tagger=None,
@ -166,33 +163,44 @@ class Language(object):
entity=None,
matcher=None,
serializer=None,
load_vectors=True):
load_vectors=True,
package=None):
"""
a model can be specified:
1) by a path to the model directory (DEPRECATED)
- Language(data_dir='path/to/data')
1) by calling a Language subclass
- spacy.en.English()
2) by a language identifier (and optionally a package root dir)
- Language(lang='en')
- Language(lang='en', data_dir='spacy/data')
2) by calling a Language subclass with data_dir
- spacy.en.English('my/model/root')
- spacy.en.English(data_dir='my/model/root')
3) by a model name/version (and optionally a package root dir)
- Language(model='en_default')
- Language(model='en_default ==1.0.0')
- Language(model='en_default <1.1.0, data_dir='spacy/data')
3) by package name
- spacy.load('en_default')
- spacy.load('en_default==1.0.0')
4) by package name with a relocated package base
- spacy.load('en_default', via='/my/package/root')
- spacy.load('en_default==1.0.0', via='/my/package/root')
"""
package = get_package(model, data_path=data_dir)
if package is None:
if data_dir is None:
package = util.get_package_by_name()
else:
package = util.get_package(data_dir)
if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True):
vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs())
vocab = self.default_vocab(package)
self.vocab = vocab
if tokenizer in (None, True):
tokenizer = Tokenizer.load(package, self.vocab)
tokenizer = Tokenizer.from_package(package, self.vocab)
self.tokenizer = tokenizer
if tagger in (None, True):
tagger = Tagger.load(package, self.vocab)
tagger = Tagger.from_package(package, self.vocab)
self.tagger = tagger
if entity in (None, True):
entity = self.default_entity(package, self.vocab)
@ -201,13 +209,12 @@ class Language(object):
parser = self.default_parser(package, self.vocab)
self.parser = parser
if matcher in (None, True):
matcher = Matcher.load(package, self.vocab)
matcher = Matcher.from_package(package, self.vocab)
self.matcher = matcher
def __reduce__(self):
args = (
None, # data_dir
None, # model
self.vocab,
self.tokenizer,
self.tagger,

View File

@ -13,8 +13,11 @@ from .util import get_package
class Lemmatizer(object):
@classmethod
def load(cls, pkg_or_str_or_file):
pkg = get_package(pkg_or_str_or_file)
def load(cls, via):
return cls.from_package(get_package(via))
@classmethod
def from_package(cls, pkg):
index = {}
exc = {}
for pos in ['adj', 'noun', 'verb']:

View File

@ -170,8 +170,11 @@ cdef class Matcher:
cdef object _patterns
@classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab):
package = get_package(pkg_or_str_or_file)
def load(cls, data_dir, Vocab vocab):
    # Convenience wrapper: resolve data_dir to a package, then delegate.
    return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, package, Vocab vocab):
    # Build a Matcher from the gazetteer patterns shipped in the model
    # package (vocab/gazetteer.json).
    patterns = package.load_json(('vocab', 'gazetteer.json'))
    return cls(vocab, patterns)

View File

@ -148,8 +148,11 @@ cdef class Tagger:
return cls(vocab, model)
@classmethod
def load(cls, pkg_or_str_or_file, vocab):
pkg = get_package(pkg_or_str_or_file)
def load(cls, data_dir, vocab):
return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates()
# templates = package.load_utf8(json.load,

View File

@ -7,11 +7,11 @@ import os
@pytest.fixture(scope="session")
def EN():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
else:
data_path = None
print("Load EN from %s" % data_path)
return English(data_dir=data_path)
data_dir = None
print("Load EN from %s" % data_dir)
return English(data_dir=data_dir)
def pytest_addoption(parser):

View File

@ -13,6 +13,7 @@ from spacy.tokenizer import Tokenizer
from os import path
import os
from spacy import util
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer
@ -21,11 +22,13 @@ from spacy.serialize.bits import BitArray
@pytest.fixture
def vocab():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
if data_dir is None:
package = util.get_package_by_name()
else:
data_path = None
vocab = English.default_vocab(package=data_path)
package = util.get_package(data_dir)
vocab = English.default_vocab(package=package)
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
lex = vocab['the']

View File

@ -5,23 +5,23 @@ import io
import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import get_package
from spacy import util
import pytest
@pytest.fixture
def package():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
if data_dir is None:
return util.get_package_by_name()
else:
data_path = None
return get_package(data_path=data_path)
return util.get_package(data_dir)
@pytest.fixture
def lemmatizer(package):
return Lemmatizer.load(package)
return Lemmatizer.from_package(package)
def test_read_index(package):

View File

@ -7,10 +7,10 @@ import os
def nlp():
from spacy.en import English
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
else:
data_path = None
return English(data_dir=data_path)
data_dir = None
return English(data_dir=data_dir)
@pytest.fixture()

View File

@ -11,13 +11,13 @@ def token(doc):
def test_load_resources_and_process_text():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
data_dir = os.environ.get('SPACY_DATA')
else:
data_path = None
print("Load EN from %s" % data_path)
data_dir = None
print("Load EN from %s" % data_dir)
from spacy.en import English
nlp = English(data_dir=data_path)
nlp = English(data_dir=data_dir)
doc = nlp('Hello, world. Here are two sentences.')

View File

@ -42,9 +42,12 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
@classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab):
pkg = get_package(pkg_or_str_or_file)
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
def load(cls, data_dir, Vocab vocab):
return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, package, Vocab vocab):
rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re)

View File

@ -4,37 +4,33 @@ import json
import re
import os.path
import six
import sputnik
from sputnik.dir_package import DirPackage
from sputnik.package_stub import PackageStub
from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def get_package(value=None, data_path=None):
if data_path is None:
if isinstance(value, PackageStub):
return value
elif value and os.path.isdir(value):
return DirPackage(value)
def get_package(data_dir):
    """Wrap the model directory *data_dir* as a sputnik DirPackage."""
    if isinstance(data_dir, six.string_types):
        return DirPackage(data_dir)
    raise RuntimeError('data_dir must be a string')
elif value is None and data_path is not None:
return DirPackage(data_path)
def get_package_by_name(name=None, via=None):
try:
return sputnik.package('spacy', about.short_version,
value or 'en_default==1.0.4',
data_path=data_path)
return sputnik.package(about.__name__, about.__version__,
name or about.__default_model__, data_path=via)
except PackageNotFoundException as e:
raise RuntimeError("Model not installed. Please run 'python -m "
"spacy.en.download' to install latest compatible "
"model.")
except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy "
"version. Please run 'python -m spacy.en.download' "
"version. Please run 'python -m spacy.en.download "
"--force' to install latest compatible model.")

View File

@ -48,11 +48,14 @@ cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
@classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None):
package = get_package(pkg_or_str_or_file)
def load(cls, data_dir, get_lex_attr=None):
return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
@classmethod
def from_package(cls, package, get_lex_attr=None):
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.load(package)
lemmatizer = Lemmatizer.from_package(package)
serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})