Merge pull request #223 from henningpeters/revise_packaging

refactor data_dir->via, add zip_safe, add spacy.load()
This commit is contained in:
Matthew Honnibal 2016-01-17 02:12:48 +11:00
commit 65c5b03b9b
18 changed files with 150 additions and 178 deletions

View File

@ -61,7 +61,7 @@ build_script:
- "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data" - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data" - "cp package.json data"
- "%CMD_IN_ENV% sputnik build data en_default.sputnik" - "%CMD_IN_ENV% sputnik build data en_default.sputnik"
- "%CMD_IN_ENV% sputnik install en_default.sputnik" - "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"
test_script: test_script:
# Run the project tests # Run the project tests

View File

@ -31,7 +31,7 @@ install:
- "python bin/init_model.py en lang_data/ corpora/ data" - "python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data" - "cp package.json data"
- "sputnik build data en_default.sputnik" - "sputnik build data en_default.sputnik"
- "sputnik install en_default.sputnik" - "sputnik --name spacy install en_default.sputnik"
script: script:
- python build.py $MODE; - python build.py $MODE;

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import division, print_function from __future__ import print_function
import os import os
import shutil import shutil
import subprocess import subprocess
@ -14,13 +14,6 @@ except ImportError:
from distutils.core import Extension, setup from distutils.core import Extension, setup
MAJOR = 0
MINOR = 100
MICRO = 0
ISRELEASED = False
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
PACKAGES = [ PACKAGES = [
'spacy', 'spacy',
'spacy.tokens', 'spacy.tokens',
@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
build_ext.build_extensions(self) build_ext.build_extensions(self)
# Return the git revision as a string
def git_version():
def _minimal_ext_cmd(cmd):
# construct minimal environment
env = {}
for k in ['SYSTEMROOT', 'PATH']:
v = os.environ.get(k)
if v is not None:
env[k] = v
# LANGUAGE is used on win32
env['LANGUAGE'] = 'C'
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0]
return out
try:
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
GIT_REVISION = out.strip().decode('ascii')
except OSError:
GIT_REVISION = 'Unknown'
return GIT_REVISION
def get_version_info():
# Adding the git rev number needs to be done inside write_version_py(),
# otherwise the import of spacy.about messes up the build under Python 3.
FULLVERSION = VERSION
if os.path.exists('.git'):
GIT_REVISION = git_version()
elif os.path.exists(os.path.join('spacy', 'about.py')):
# must be a source distribution, use existing version file
try:
from spacy.about import git_revision as GIT_REVISION
except ImportError:
raise ImportError('Unable to import git_revision. Try removing '
'spacy/about.py and the build directory '
'before building.')
else:
GIT_REVISION = 'Unknown'
if not ISRELEASED:
FULLVERSION += '.dev0+' + GIT_REVISION[:7]
return FULLVERSION, GIT_REVISION
def write_version(path):
cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
short_version = '%(version)s'
version = '%(version)s'
full_version = '%(full_version)s'
git_revision = '%(git_revision)s'
release = %(isrelease)s
if not release:
version = full_version
"""
FULLVERSION, GIT_REVISION = get_version_info()
with open(path, 'w') as f:
f.write(cnt % {'version': VERSION,
'full_version' : FULLVERSION,
'git_revision' : GIT_REVISION,
'isrelease': str(ISRELEASED)})
def generate_cython(root, source): def generate_cython(root, source):
print('Cythonizing sources') print('Cythonizing sources')
p = subprocess.call([sys.executable, p = subprocess.call([sys.executable,
@ -241,7 +167,9 @@ def setup_package():
return clean(root) return clean(root)
with chdir(root): with chdir(root):
write_version(os.path.join(root, 'spacy', 'about.py')) about = {}
with open(os.path.join(root, "spacy", "about.py")) as f:
exec(f.read(), about)
include_dirs = [ include_dirs = [
get_python_inc(plat_specific=True), get_python_inc(plat_specific=True),
@ -259,15 +187,16 @@ def setup_package():
prepare_includes(root) prepare_includes(root)
setup( setup(
name='spacy', name=about['__name__'],
zip_safe=False,
packages=PACKAGES, packages=PACKAGES,
package_data={'': ['*.pyx', '*.pxd']}, package_data={'': ['*.pyx', '*.pxd']},
description='Industrial-strength NLP', description=about['__summary__'],
author='Matthew Honnibal', author=about['__author__'],
author_email='matt@spacy.io', author_email=about['__email__'],
version=VERSION, version=about['__version__'],
url='https://spacy.io', url=about['__uri__'],
license='MIT', license=about['__license__'],
ext_modules=ext_modules, ext_modules=ext_modules,
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47', install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six', 'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',

View File

@ -0,0 +1,7 @@
from . import util
from .en import English
def load(name, via=None):
package = util.get_package_by_name(name, via=via)
return English(package=package)

14
spacy/about.py Normal file
View File

@ -0,0 +1,14 @@
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__name__ = 'spacy'
__version__ = '0.100.0'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@spacy.io'
__license__ = 'MIT'
__release__ = False
__default_model__ = 'en_default==1.0.4'

View File

@ -1,9 +1,13 @@
from __future__ import print_function
import sys import sys
import os import os
import shutil import shutil
import plac import plac
import sputnik import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from .. import about from .. import about
@ -20,37 +24,34 @@ def migrate(path):
os.unlink(os.path.join(path, filename)) os.unlink(os.path.join(path, filename))
def link(package, path):
if os.path.exists(path):
if os.path.isdir(path):
shutil.rmtree(path)
else:
os.unlink(path)
if not hasattr(os, 'symlink'): # not supported by win+py27
shutil.copytree(package.dir_path('data'), path)
else:
os.symlink(package.dir_path('data'), path)
@plac.annotations( @plac.annotations(
force=("Force overwrite", "flag", "f", bool), force=("Force overwrite", "flag", "f", bool),
) )
def main(data_size='all', force=False): def main(data_size='all', force=False):
path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.abspath(os.path.join(path, '..', 'data'))
if not os.path.isdir(data_path):
os.mkdir(data_path)
if force: if force:
sputnik.purge('spacy', about.short_version, data_path=data_path) sputnik.purge(about.__name__, about.__version__)
package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4', try:
data_path=data_path) sputnik.package(about.__name__, about.__version__, about.__default_model__)
print("Model already installed. Please run 'python -m "
"spacy.en.download --force' to reinstall.", file=sys.stderr)
sys.exit(1)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
try:
sputnik.package(about.__name__, about.__version__, about.__default_model__)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
print("Model failed to install. Please run 'python -m "
"spacy.en.download --force'.", file=sys.stderr)
sys.exit(1)
# FIXME clean up old-style packages # FIXME clean up old-style packages
migrate(path) migrate(os.path.dirname(os.path.abspath(__file__)))
print("Model successfully installed.", file=sys.stderr)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -19,8 +19,8 @@ from . import orth
from .syntax.ner import BiluoPushDown from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager from .syntax.arc_eager import ArcEager
from . import util
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package
class Language(object): class Language(object):
@ -137,12 +137,10 @@ class Language(object):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod @classmethod
def default_vocab(cls, package=None, get_lex_attr=None): def default_vocab(cls, package, get_lex_attr=None):
if package is None:
package = get_package()
if get_lex_attr is None: if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs() get_lex_attr = cls.default_lex_attrs()
return Vocab.load(package, get_lex_attr=get_lex_attr) return Vocab.from_package(package, get_lex_attr=get_lex_attr)
@classmethod @classmethod
def default_parser(cls, package, vocab): def default_parser(cls, package, vocab):
@ -158,7 +156,6 @@ class Language(object):
def __init__(self, def __init__(self,
data_dir=None, data_dir=None,
model=None,
vocab=None, vocab=None,
tokenizer=None, tokenizer=None,
tagger=None, tagger=None,
@ -166,33 +163,44 @@ class Language(object):
entity=None, entity=None,
matcher=None, matcher=None,
serializer=None, serializer=None,
load_vectors=True): load_vectors=True,
package=None):
""" """
a model can be specified: a model can be specified:
1) by a path to the model directory (DEPRECATED) 1) by calling a Language subclass
- Language(data_dir='path/to/data') - spacy.en.English()
2) by a language identifier (and optionally a package root dir) 2) by calling a Language subclass with data_dir
- Language(lang='en') - spacy.en.English('my/model/root')
- Language(lang='en', data_dir='spacy/data') - spacy.en.English(data_dir='my/model/root')
3) by a model name/version (and optionally a package root dir) 3) by package name
- Language(model='en_default') - spacy.load('en_default')
- Language(model='en_default ==1.0.0') - spacy.load('en_default==1.0.0')
- Language(model='en_default <1.1.0, data_dir='spacy/data')
4) by package name with a relocated package base
- spacy.load('en_default', via='/my/package/root')
- spacy.load('en_default==1.0.0', via='/my/package/root')
""" """
package = get_package(model, data_path=data_dir)
if package is None:
if data_dir is None:
package = util.get_package_by_name()
else:
package = util.get_package(data_dir)
if load_vectors is not True: if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning) warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True): if vocab in (None, True):
vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs()) vocab = self.default_vocab(package)
self.vocab = vocab self.vocab = vocab
if tokenizer in (None, True): if tokenizer in (None, True):
tokenizer = Tokenizer.load(package, self.vocab) tokenizer = Tokenizer.from_package(package, self.vocab)
self.tokenizer = tokenizer self.tokenizer = tokenizer
if tagger in (None, True): if tagger in (None, True):
tagger = Tagger.load(package, self.vocab) tagger = Tagger.from_package(package, self.vocab)
self.tagger = tagger self.tagger = tagger
if entity in (None, True): if entity in (None, True):
entity = self.default_entity(package, self.vocab) entity = self.default_entity(package, self.vocab)
@ -201,13 +209,12 @@ class Language(object):
parser = self.default_parser(package, self.vocab) parser = self.default_parser(package, self.vocab)
self.parser = parser self.parser = parser
if matcher in (None, True): if matcher in (None, True):
matcher = Matcher.load(package, self.vocab) matcher = Matcher.from_package(package, self.vocab)
self.matcher = matcher self.matcher = matcher
def __reduce__(self): def __reduce__(self):
args = ( args = (
None, # data_dir None, # data_dir
None, # model
self.vocab, self.vocab,
self.tokenizer, self.tokenizer,
self.tagger, self.tagger,

View File

@ -13,8 +13,11 @@ from .util import get_package
class Lemmatizer(object): class Lemmatizer(object):
@classmethod @classmethod
def load(cls, pkg_or_str_or_file): def load(cls, via):
pkg = get_package(pkg_or_str_or_file) return cls.from_package(get_package(via))
@classmethod
def from_package(cls, pkg):
index = {} index = {}
exc = {} exc = {}
for pos in ['adj', 'noun', 'verb']: for pos in ['adj', 'noun', 'verb']:

View File

@ -170,8 +170,11 @@ cdef class Matcher:
cdef object _patterns cdef object _patterns
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, data_dir, Vocab vocab):
package = get_package(pkg_or_str_or_file) return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, package, Vocab vocab):
patterns = package.load_json(('vocab', 'gazetteer.json')) patterns = package.load_json(('vocab', 'gazetteer.json'))
return cls(vocab, patterns) return cls(vocab, patterns)

View File

@ -148,8 +148,11 @@ cdef class Tagger:
return cls(vocab, model) return cls(vocab, model)
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, vocab): def load(cls, data_dir, vocab):
pkg = get_package(pkg_or_str_or_file) return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package # TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates() templates = cls.default_templates()
# templates = package.load_utf8(json.load, # templates = package.load_utf8(json.load,

View File

@ -7,11 +7,11 @@ import os
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def EN(): def EN():
if os.environ.get('SPACY_DATA'): if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA') data_dir = os.environ.get('SPACY_DATA')
else: else:
data_path = None data_dir = None
print("Load EN from %s" % data_path) print("Load EN from %s" % data_dir)
return English(data_dir=data_path) return English(data_dir=data_dir)
def pytest_addoption(parser): def pytest_addoption(parser):

View File

@ -13,6 +13,7 @@ from spacy.tokenizer import Tokenizer
from os import path from os import path
import os import os
from spacy import util
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer from spacy.serialize.packer import Packer
@ -21,11 +22,13 @@ from spacy.serialize.bits import BitArray
@pytest.fixture @pytest.fixture
def vocab(): def vocab():
if os.environ.get('SPACY_DATA'): data_dir = os.environ.get('SPACY_DATA')
data_path = os.environ.get('SPACY_DATA') if data_dir is None:
package = util.get_package_by_name()
else: else:
data_path = None package = util.get_package(data_dir)
vocab = English.default_vocab(package=data_path)
vocab = English.default_vocab(package=package)
lex = vocab['dog'] lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog' assert vocab[vocab.strings['dog']].orth_ == 'dog'
lex = vocab['the'] lex = vocab['the']

View File

@ -5,23 +5,23 @@ import io
import pickle import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import get_package from spacy import util
import pytest import pytest
@pytest.fixture @pytest.fixture
def package(): def package():
if os.environ.get('SPACY_DATA'): data_dir = os.environ.get('SPACY_DATA')
data_path = os.environ.get('SPACY_DATA') if data_dir is None:
return util.get_package_by_name()
else: else:
data_path = None return util.get_package(data_dir)
return get_package(data_path=data_path)
@pytest.fixture @pytest.fixture
def lemmatizer(package): def lemmatizer(package):
return Lemmatizer.load(package) return Lemmatizer.from_package(package)
def test_read_index(package): def test_read_index(package):

View File

@ -7,10 +7,10 @@ import os
def nlp(): def nlp():
from spacy.en import English from spacy.en import English
if os.environ.get('SPACY_DATA'): if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA') data_dir = os.environ.get('SPACY_DATA')
else: else:
data_path = None data_dir = None
return English(data_dir=data_path) return English(data_dir=data_dir)
@pytest.fixture() @pytest.fixture()

View File

@ -11,13 +11,13 @@ def token(doc):
def test_load_resources_and_process_text(): def test_load_resources_and_process_text():
if os.environ.get('SPACY_DATA'): if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA') data_dir = os.environ.get('SPACY_DATA')
else: else:
data_path = None data_dir = None
print("Load EN from %s" % data_path) print("Load EN from %s" % data_dir)
from spacy.en import English from spacy.en import English
nlp = English(data_dir=data_path) nlp = English(data_dir=data_dir)
doc = nlp('Hello, world. Here are two sentences.') doc = nlp('Hello, world. Here are two sentences.')

View File

@ -42,9 +42,12 @@ cdef class Tokenizer:
return (self.__class__, args, None, None) return (self.__class__, args, None, None)
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, data_dir, Vocab vocab):
pkg = get_package(pkg_or_str_or_file) return cls.from_package(get_package(data_dir), vocab=vocab)
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
@classmethod
def from_package(cls, package, Vocab vocab):
rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
prefix_re = re.compile(prefix_re) prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re) suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re) infix_re = re.compile(infix_re)

View File

@ -4,37 +4,33 @@ import json
import re import re
import os.path import os.path
import six
import sputnik import sputnik
from sputnik.dir_package import DirPackage from sputnik.dir_package import DirPackage
from sputnik.package_stub import PackageStub from sputnik.package_list import (PackageNotFoundException,
from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException CompatiblePackageNotFoundException)
from . import about from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def get_package(value=None, data_path=None): def get_package(data_dir):
if data_path is None: if not isinstance(data_dir, six.string_types):
if isinstance(value, PackageStub): raise RuntimeError('data_dir must be a string')
return value return DirPackage(data_dir)
elif value and os.path.isdir(value):
return DirPackage(value)
elif value is None and data_path is not None:
return DirPackage(data_path)
def get_package_by_name(name=None, via=None):
try: try:
return sputnik.package('spacy', about.short_version, return sputnik.package(about.__name__, about.__version__,
value or 'en_default==1.0.4', name or about.__default_model__, data_path=via)
data_path=data_path)
except PackageNotFoundException as e: except PackageNotFoundException as e:
raise RuntimeError("Model not installed. Please run 'python -m " raise RuntimeError("Model not installed. Please run 'python -m "
"spacy.en.download' to install latest compatible " "spacy.en.download' to install latest compatible "
"model.") "model.")
except CompatiblePackageNotFoundException as e: except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy " raise RuntimeError("Installed model is not compatible with spaCy "
"version. Please run 'python -m spacy.en.download' " "version. Please run 'python -m spacy.en.download "
"--force' to install latest compatible model.") "--force' to install latest compatible model.")

View File

@ -48,11 +48,14 @@ cdef class Vocab:
'''A map container for a language's LexemeC structs. '''A map container for a language's LexemeC structs.
''' '''
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None): def load(cls, data_dir, get_lex_attr=None):
package = get_package(pkg_or_str_or_file) return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
@classmethod
def from_package(cls, package, get_lex_attr=None):
tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.load(package) lemmatizer = Lemmatizer.from_package(package)
serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={}) serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})