diff --git a/.appveyor.yml b/.appveyor.yml
index d4cab45cd..0fc7dad50 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -61,7 +61,7 @@ build_script:
   - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
   - "cp package.json data"
   - "%CMD_IN_ENV% sputnik build data en_default.sputnik"
-  - "%CMD_IN_ENV% sputnik install en_default.sputnik"
+  - "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"

test_script:
  # Run the project tests
diff --git a/.travis.yml b/.travis.yml
index b6c2a430e..e89dd19e4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,7 +31,7 @@ install:
  - "python bin/init_model.py en lang_data/ corpora/ data"
  - "cp package.json data"
  - "sputnik build data en_default.sputnik"
-  - "sputnik install en_default.sputnik"
+  - "sputnik --name spacy install en_default.sputnik"

script:
  - python build.py $MODE;
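
Both CI configs now install the built model archive under an explicit application namespace (`--name spacy`) rather than sputnik's default, matching the programmatic calls introduced in spacy/en/download.py below. A minimal sketch of the by-name install those calls perform; the name/version literals mirror `about.__name__` and `about.__version__` from the new spacy/about.py, while the CLI lines above install a locally built archive instead:

    # Sketch: namespaced model install, using the sputnik calls that
    # appear in spacy/en/download.py later in this diff.
    import sputnik

    app_name, app_version = 'spacy', '0.100.0'   # about.__name__ / about.__version__
    sputnik.install(app_name, app_version, 'en_default==1.0.4')
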
diff --git a/setup.py b/setup.py
index a1e7dc94b..ce189985b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import division, print_function
+from __future__ import print_function
 import os
 import shutil
 import subprocess
@@ -14,13 +14,6 @@
 except ImportError:
     from distutils.core import Extension, setup

-MAJOR = 0
-MINOR = 100
-MICRO = 0
-ISRELEASED = False
-VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
-
-
 PACKAGES = [
     'spacy',
     'spacy.tokens',
@@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
         build_ext.build_extensions(self)


-# Return the git revision as a string
-def git_version():
-    def _minimal_ext_cmd(cmd):
-        # construct minimal environment
-        env = {}
-        for k in ['SYSTEMROOT', 'PATH']:
-            v = os.environ.get(k)
-            if v is not None:
-                env[k] = v
-        # LANGUAGE is used on win32
-        env['LANGUAGE'] = 'C'
-        env['LANG'] = 'C'
-        env['LC_ALL'] = 'C'
-        out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0]
-        return out
-
-    try:
-        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
-        GIT_REVISION = out.strip().decode('ascii')
-    except OSError:
-        GIT_REVISION = 'Unknown'
-
-    return GIT_REVISION
-
-
-def get_version_info():
-    # Adding the git rev number needs to be done inside write_version_py(),
-    # otherwise the import of spacy.about messes up the build under Python 3.
-    FULLVERSION = VERSION
-    if os.path.exists('.git'):
-        GIT_REVISION = git_version()
-    elif os.path.exists(os.path.join('spacy', 'about.py')):
-        # must be a source distribution, use existing version file
-        try:
-            from spacy.about import git_revision as GIT_REVISION
-        except ImportError:
-            raise ImportError('Unable to import git_revision. Try removing '
-                              'spacy/about.py and the build directory '
-                              'before building.')
-    else:
-        GIT_REVISION = 'Unknown'
-
-    if not ISRELEASED:
-        FULLVERSION += '.dev0+' + GIT_REVISION[:7]
-
-    return FULLVERSION, GIT_REVISION
-
-
-def write_version(path):
-    cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
-short_version = '%(version)s'
-version = '%(version)s'
-full_version = '%(full_version)s'
-git_revision = '%(git_revision)s'
-release = %(isrelease)s
-if not release:
-    version = full_version
-"""
-    FULLVERSION, GIT_REVISION = get_version_info()
-
-    with open(path, 'w') as f:
-        f.write(cnt % {'version': VERSION,
-                       'full_version' : FULLVERSION,
-                       'git_revision' : GIT_REVISION,
-                       'isrelease': str(ISRELEASED)})
-
-
 def generate_cython(root, source):
     print('Cythonizing sources')
     p = subprocess.call([sys.executable,
@@ -241,7 +167,9 @@ def setup_package():
         return clean(root)

     with chdir(root):
-        write_version(os.path.join(root, 'spacy', 'about.py'))
+        about = {}
+        with open(os.path.join(root, "spacy", "about.py")) as f:
+            exec(f.read(), about)

         include_dirs = [
             get_python_inc(plat_specific=True),
@@ -259,15 +187,16 @@ def setup_package():
         prepare_includes(root)

         setup(
-            name='spacy',
+            name=about['__name__'],
+            zip_safe=False,
             packages=PACKAGES,
             package_data={'': ['*.pyx', '*.pxd']},
-            description='Industrial-strength NLP',
-            author='Matthew Honnibal',
-            author_email='matt@spacy.io',
-            version=VERSION,
-            url='https://spacy.io',
-            license='MIT',
+            description=about['__summary__'],
+            author=about['__author__'],
+            author_email=about['__email__'],
+            version=about['__version__'],
+            url=about['__uri__'],
+            license=about['__license__'],
             ext_modules=ext_modules,
             install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
                               'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index e69de29bb..556027a42 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -0,0 +1,7 @@
+from . import util
+from .en import English
+
+
+def load(name, via=None):
+    package = util.get_package_by_name(name, via=via)
+    return English(package=package)
diff --git a/spacy/about.py b/spacy/about.py
new file mode 100644
index 000000000..6ad68f5ba
--- /dev/null
+++ b/spacy/about.py
@@ -0,0 +1,14 @@
+# inspired from:
+
+# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
+# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
+
+__name__ = 'spacy'
+__version__ = '0.100.0'
+__summary__ = 'Industrial-strength NLP'
+__uri__ = 'https://spacy.io'
+__author__ = 'Matthew Honnibal'
+__email__ = 'matt@spacy.io'
+__license__ = 'MIT'
+__release__ = False
+__default_model__ = 'en_default==1.0.4'
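
With this change, setup.py no longer generates spacy/about.py at build time: the metadata lives in that file permanently, and setup.py reads it back with exec() so the package itself is never imported during a build (importing it would require the not-yet-compiled extensions). A minimal standalone sketch of the pattern, assuming it is run from the repository root:

    # Sketch: single-sourcing package metadata without importing the package.
    import os

    about = {}
    with open(os.path.join('spacy', 'about.py')) as f:
        exec(f.read(), about)

    print(about['__name__'], about['__version__'])  # -> spacy 0.100.0
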
diff --git a/spacy/en/download.py b/spacy/en/download.py
index bdc0ac9b0..ae1e62e44 100644
--- a/spacy/en/download.py
+++ b/spacy/en/download.py
@@ -1,9 +1,13 @@
+from __future__ import print_function
+
 import sys
 import os
 import shutil

 import plac
 import sputnik
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)

 from .. import about
@@ -20,37 +24,34 @@ def migrate(path):
             os.unlink(os.path.join(path, filename))


-def link(package, path):
-    if os.path.exists(path):
-        if os.path.isdir(path):
-            shutil.rmtree(path)
-        else:
-            os.unlink(path)
-
-    if not hasattr(os, 'symlink'): # not supported by win+py27
-        shutil.copytree(package.dir_path('data'), path)
-    else:
-        os.symlink(package.dir_path('data'), path)
-
-
 @plac.annotations(
     force=("Force overwrite", "flag", "f", bool),
 )
 def main(data_size='all', force=False):
-    path = os.path.dirname(os.path.abspath(__file__))
-
-    data_path = os.path.abspath(os.path.join(path, '..', 'data'))
-    if not os.path.isdir(data_path):
-        os.mkdir(data_path)
-
     if force:
-        sputnik.purge('spacy', about.short_version, data_path=data_path)
+        sputnik.purge(about.__name__, about.__version__)

-    package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4',
-                              data_path=data_path)
+    try:
+        sputnik.package(about.__name__, about.__version__, about.__default_model__)
+        print("Model already installed. Please run 'python -m "
+              "spacy.en.download --force' to reinstall.", file=sys.stderr)
+        sys.exit(1)
+    except (PackageNotFoundException, CompatiblePackageNotFoundException):
+        pass
+
+    package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
+
+    try:
+        sputnik.package(about.__name__, about.__version__, about.__default_model__)
+    except (PackageNotFoundException, CompatiblePackageNotFoundException):
+        print("Model failed to install. Please run 'python -m "
+              "spacy.en.download --force'.", file=sys.stderr)
+        sys.exit(1)

     # FIXME clean up old-style packages
-    migrate(path)
+    migrate(os.path.dirname(os.path.abspath(__file__)))
+
+    print("Model successfully installed.", file=sys.stderr)


 if __name__ == '__main__':
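
main() now fails loudly in both directions: it refuses to reinstall over an existing model unless --force is given, and it verifies that the freshly installed model actually resolves before reporting success. The control flow, reduced to its essentials and assuming exactly the sputnik API used in the hunk above:

    # Sketch: the check-install-verify flow from main() above.
    import sputnik
    from sputnik.package_list import (PackageNotFoundException,
                                      CompatiblePackageNotFoundException)

    from spacy import about

    try:
        # raises if no compatible model is installed yet
        sputnik.package(about.__name__, about.__version__, about.__default_model__)
        raise SystemExit('already installed; use --force to reinstall')
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        sputnik.install(about.__name__, about.__version__, about.__default_model__)
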
diff --git a/spacy/language.py b/spacy/language.py
index fe7cabcd7..8efcc618e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -19,8 +19,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager

+from . import util
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import get_package


 class Language(object):
@@ -137,12 +137,10 @@ class Language(object):
         return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}

     @classmethod
-    def default_vocab(cls, package=None, get_lex_attr=None):
-        if package is None:
-            package = get_package()
+    def default_vocab(cls, package, get_lex_attr=None):
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
-        return Vocab.load(package, get_lex_attr=get_lex_attr)
+        return Vocab.from_package(package, get_lex_attr=get_lex_attr)

     @classmethod
     def default_parser(cls, package, vocab):
@@ -158,7 +156,6 @@
     def __init__(self,
                  data_dir=None,
-                 model=None,
                  vocab=None,
                  tokenizer=None,
                  tagger=None,
@@ -166,33 +163,44 @@
                  entity=None,
                  matcher=None,
                  serializer=None,
-                 load_vectors=True):
+                 load_vectors=True,
+                 package=None):
         """
            a model can be specified:

-           1) by a path to the model directory (DEPRECATED)
-             - Language(data_dir='path/to/data')
+           1) by calling a Language subclass
+             - spacy.en.English()

-           2) by a language identifier (and optionally a package root dir)
-             - Language(lang='en')
-             - Language(lang='en', data_dir='spacy/data')
+           2) by calling a Language subclass with data_dir
+             - spacy.en.English('my/model/root')
+             - spacy.en.English(data_dir='my/model/root')

-           3) by a model name/version (and optionally a package root dir)
-             - Language(model='en_default')
-             - Language(model='en_default ==1.0.0')
-             - Language(model='en_default <1.1.0, data_dir='spacy/data')
+           3) by package name
+             - spacy.load('en_default')
+             - spacy.load('en_default==1.0.0')
+
+           4) by package name with a relocated package base
+             - spacy.load('en_default', via='/my/package/root')
+             - spacy.load('en_default==1.0.0', via='/my/package/root')
         """
-        package = get_package(model, data_path=data_dir)
+
+        if package is None:
+            if data_dir is None:
+                package = util.get_package_by_name()
+            else:
+                package = util.get_package(data_dir)
+
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
+
         if vocab in (None, True):
-            vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs())
+            vocab = self.default_vocab(package)
         self.vocab = vocab
         if tokenizer in (None, True):
-            tokenizer = Tokenizer.load(package, self.vocab)
+            tokenizer = Tokenizer.from_package(package, self.vocab)
         self.tokenizer = tokenizer
         if tagger in (None, True):
-            tagger = Tagger.load(package, self.vocab)
+            tagger = Tagger.from_package(package, self.vocab)
         self.tagger = tagger
         if entity in (None, True):
             entity = self.default_entity(package, self.vocab)
@@ -201,13 +209,12 @@
             parser = self.default_parser(package, self.vocab)
         self.parser = parser
         if matcher in (None, True):
-            matcher = Matcher.load(package, self.vocab)
+            matcher = Matcher.from_package(package, self.vocab)
         self.matcher = matcher

     def __reduce__(self):
         args = (
             None, # data_dir
-            None, # model
             self.vocab,
             self.tokenizer,
             self.tagger,
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 556de3659..a05ca49c0 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -13,8 +13,11 @@ from .util import get_package

 class Lemmatizer(object):
     @classmethod
-    def load(cls, pkg_or_str_or_file):
-        pkg = get_package(pkg_or_str_or_file)
+    def load(cls, via):
+        return cls.from_package(get_package(via))
+
+    @classmethod
+    def from_package(cls, pkg):
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
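
Lemmatizer here, and Matcher, Tagger, Tokenizer and Vocab below, all split loading in two: load() resolves a directory path to a package, while from_package() consumes a package object directly, so Language can resolve the package once and hand it to every component. A sketch of the pattern with a hypothetical Component class (the JSON path is illustrative, not a real resource):

    # Sketch: the load()/from_package() split applied across this diff.
    from spacy.util import get_package


    class Component(object):
        def __init__(self, data):
            self.data = data

        @classmethod
        def load(cls, data_dir):
            # path-based entry point: resolve the directory, then delegate
            return cls.from_package(get_package(data_dir))

        @classmethod
        def from_package(cls, package):
            # package-based entry point: no filesystem guessing here
            return cls(package.load_json(('vocab', 'component.json')))
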
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 2b7364487..cef98c068 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -170,8 +170,11 @@ cdef class Matcher:
     cdef object _patterns

     @classmethod
-    def load(cls, pkg_or_str_or_file, Vocab vocab):
-        package = get_package(pkg_or_str_or_file)
+    def load(cls, data_dir, Vocab vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
+    @classmethod
+    def from_package(cls, package, Vocab vocab):
         patterns = package.load_json(('vocab', 'gazetteer.json'))
         return cls(vocab, patterns)
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index a3f8797e2..493cc4f99 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -148,8 +148,11 @@ cdef class Tagger:
         return cls(vocab, model)

     @classmethod
-    def load(cls, pkg_or_str_or_file, vocab):
-        pkg = get_package(pkg_or_str_or_file)
+    def load(cls, data_dir, vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
+    @classmethod
+    def from_package(cls, pkg, vocab):
         # TODO: templates.json deprecated? not present in latest package
         templates = cls.default_templates()
         # templates = package.load_utf8(json.load,
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index b8a620d88..83a39a03a 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -7,11 +7,11 @@ import os
 @pytest.fixture(scope="session")
 def EN():
     if os.environ.get('SPACY_DATA'):
-        data_path = os.environ.get('SPACY_DATA')
+        data_dir = os.environ.get('SPACY_DATA')
     else:
-        data_path = None
-    print("Load EN from %s" % data_path)
-    return English(data_dir=data_path)
+        data_dir = None
+    print("Load EN from %s" % data_dir)
+    return English(data_dir=data_dir)


 def pytest_addoption(parser):
diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py
index 1d3b12117..392cba8e3 100644
--- a/spacy/tests/serialize/test_packer.py
+++ b/spacy/tests/serialize/test_packer.py
@@ -13,6 +13,7 @@ from spacy.tokenizer import Tokenizer
 from os import path
 import os

+from spacy import util
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 from spacy.serialize.packer import Packer
@@ -21,11 +22,13 @@ from spacy.serialize.bits import BitArray

 @pytest.fixture
 def vocab():
-    if os.environ.get('SPACY_DATA'):
-        data_path = os.environ.get('SPACY_DATA')
+    data_dir = os.environ.get('SPACY_DATA')
+    if data_dir is None:
+        package = util.get_package_by_name()
     else:
-        data_path = None
-    vocab = English.default_vocab(package=data_path)
+        package = util.get_package(data_dir)
+
+    vocab = English.default_vocab(package=package)
     lex = vocab['dog']
     assert vocab[vocab.strings['dog']].orth_ == 'dog'
     lex = vocab['the']
diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index 3de30693c..af85645a6 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -5,23 +5,23 @@ import io
 import pickle

 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.util import get_package
+from spacy import util

 import pytest


 @pytest.fixture
 def package():
-    if os.environ.get('SPACY_DATA'):
-        data_path = os.environ.get('SPACY_DATA')
+    data_dir = os.environ.get('SPACY_DATA')
+    if data_dir is None:
+        return util.get_package_by_name()
     else:
-        data_path = None
-    return get_package(data_path=data_path)
+        return util.get_package(data_dir)


 @pytest.fixture
 def lemmatizer(package):
-    return Lemmatizer.load(package)
+    return Lemmatizer.from_package(package)


 def test_read_index(package):
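
The test fixtures keep honoring the SPACY_DATA environment variable, but now resolve it to a package object explicitly instead of threading a raw path through default_vocab(). The resolution logic these fixtures share:

    # Sketch: SPACY_DATA override resolution, as used by the fixtures above.
    import os
    from spacy import util

    data_dir = os.environ.get('SPACY_DATA')
    if data_dir is None:
        package = util.get_package_by_name()   # installed default model
    else:
        package = util.get_package(data_dir)   # explicit directory override
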
diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py
index d7b4b3252..e2c64cfd7 100644
--- a/spacy/tests/website/conftest.py
+++ b/spacy/tests/website/conftest.py
@@ -7,10 +7,10 @@
 def nlp():
     from spacy.en import English
     if os.environ.get('SPACY_DATA'):
-        data_path = os.environ.get('SPACY_DATA')
+        data_dir = os.environ.get('SPACY_DATA')
     else:
-        data_path = None
-    return English(data_dir=data_path)
+        data_dir = None
+    return English(data_dir=data_dir)


 @pytest.fixture()
diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py
index ef13b4677..3d9aa5dd6 100644
--- a/spacy/tests/website/test_home.py
+++ b/spacy/tests/website/test_home.py
@@ -11,13 +11,13 @@ def token(doc):

 def test_load_resources_and_process_text():
     if os.environ.get('SPACY_DATA'):
-        data_path = os.environ.get('SPACY_DATA')
+        data_dir = os.environ.get('SPACY_DATA')
     else:
-        data_path = None
-    print("Load EN from %s" % data_path)
+        data_dir = None
+    print("Load EN from %s" % data_dir)
     from spacy.en import English
-    nlp = English(data_dir=data_path)
+    nlp = English(data_dir=data_dir)

     doc = nlp('Hello, world. Here are two sentences.')
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 49e8a06ef..593d0dc7d 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -42,9 +42,12 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)

     @classmethod
-    def load(cls, pkg_or_str_or_file, Vocab vocab):
-        pkg = get_package(pkg_or_str_or_file)
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
+    def load(cls, data_dir, Vocab vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
+    @classmethod
+    def from_package(cls, package, Vocab vocab):
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)
diff --git a/spacy/util.py b/spacy/util.py
index c998df056..390c83a03 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -4,37 +4,33 @@ import json
 import re
 import os.path

+import six
 import sputnik
 from sputnik.dir_package import DirPackage
-from sputnik.package_stub import PackageStub
-from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)

 from . import about
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE


-def get_package(value=None, data_path=None):
-    if data_path is None:
-        if isinstance(value, PackageStub):
-            return value
-        elif value and os.path.isdir(value):
-            return DirPackage(value)
+def get_package(data_dir):
+    if not isinstance(data_dir, six.string_types):
+        raise RuntimeError('data_dir must be a string')
+    return DirPackage(data_dir)

-        elif value is None and data_path is not None:
-            return DirPackage(data_path)

+def get_package_by_name(name=None, via=None):
     try:
-        return sputnik.package('spacy', about.short_version,
-                               value or 'en_default==1.0.4',
-                               data_path=data_path)
-
+        return sputnik.package(about.__name__, about.__version__,
+                               name or about.__default_model__, data_path=via)
     except PackageNotFoundException as e:
         raise RuntimeError("Model not installed. Please run 'python -m "
                            "spacy.en.download' to install latest compatible "
                            "model.")
     except CompatiblePackageNotFoundException as e:
         raise RuntimeError("Installed model is not compatible with spaCy "
-                           "version. Please run 'python -m spacy.en.download' "
+                           "version. Please run 'python -m spacy.en.download "
                            "--force' to install latest compatible model.")
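
util.get_package() is now strictly path-based (it only wraps a directory in a DirPackage), while the sputnik lookup moves into the new util.get_package_by_name(); the via argument relocates the package base. Both entry points side by side (the directory path is illustrative):

    # Sketch: the two package-resolution entry points after this change.
    from spacy import util

    pkg_from_dir = util.get_package('/path/to/model/data')   # DirPackage
    pkg_default  = util.get_package_by_name()                # en_default==1.0.4
    pkg_by_name  = util.get_package_by_name('en_default==1.0.0',
                                            via='/my/package/root')
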
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index e09cb48de..f4750dcb5 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -48,11 +48,14 @@ cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
     @classmethod
-    def load(cls, pkg_or_str_or_file, get_lex_attr=None):
-        package = get_package(pkg_or_str_or_file)
+    def load(cls, data_dir, get_lex_attr=None):
+        return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
+
+    @classmethod
+    def from_package(cls, package, get_lex_attr=None):
         tag_map = package.load_json(('vocab', 'tag_map.json'), default={})

-        lemmatizer = Lemmatizer.load(package)
+        lemmatizer = Lemmatizer.from_package(package)

         serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
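
Taken together, the patch leaves four equivalent ways to load a model, as enumerated in the new Language.__init__ docstring; the model paths are placeholders:

    # Sketch: the loading styles documented in Language.__init__ above.
    import spacy
    from spacy.en import English

    nlp = English()                                   # 1) default installed model
    nlp = English(data_dir='my/model/root')           # 2) explicit data directory
    nlp = spacy.load('en_default')                    # 3) by package name
    nlp = spacy.load('en_default', via='/my/package/root')  # 4) relocated base
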