diff --git a/setup.py b/setup.py
index a1e7dc94b..349e85b6b 100644
--- a/setup.py
+++ b/setup.py
@@ -260,6 +260,7 @@ def setup_package():
     setup(
         name='spacy',
+        zip_safe=False,
         packages=PACKAGES,
         package_data={'': ['*.pyx', '*.pxd']},
         description='Industrial-strength NLP',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index e69de29bb..57d02d95f 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -0,0 +1,7 @@
+from . import util
+from .en import English
+
+
+def load(name, via=None):
+    package = util.get_package_by_name(name, via=via)
+    return English(package)
diff --git a/spacy/en/download.py b/spacy/en/download.py
index bdc0ac9b0..3195aa127 100644
--- a/spacy/en/download.py
+++ b/spacy/en/download.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+
 import sys
 import os
 import shutil
@@ -37,21 +39,26 @@ def link(package, path):
     force=("Force overwrite", "flag", "f", bool),
 )
 def main(data_size='all', force=False):
+    package_name = 'en_default==1.0.4'
     path = os.path.dirname(os.path.abspath(__file__))
-    data_path = os.path.abspath(os.path.join(path, '..', 'data'))
-    if not os.path.isdir(data_path):
-        os.mkdir(data_path)
-
     if force:
-        sputnik.purge('spacy', about.short_version, data_path=data_path)
+        sputnik.purge('spacy', about.short_version)
 
-    package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4',
-                              data_path=data_path)
+    package = sputnik.install('spacy', about.short_version, package_name)
+
+    try:
+        sputnik.package('spacy', about.short_version, package_name)
+    except (PackageNotFoundException, CompatiblePackageNotFoundException):
+        print("Model failed to install. Please run 'python -m "
+              "spacy.en.download --force'.", file=sys.stderr)
+        sys.exit(1)
 
     # FIXME clean up old-style packages
     migrate(path)
 
+    print("Model successfully installed.", file=sys.stderr)
+
 
 if __name__ == '__main__':
     plac.call(main)
diff --git a/spacy/language.py b/spacy/language.py
index fe7cabcd7..8c86d53dd 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -8,6 +8,9 @@ try:
 except ImportError:
     import json
 
+import sputnik
+from sputnik.dir_package import DirPackage
+
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .syntax.parser import Parser
@@ -19,8 +22,9 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
+from . import about
+from . import util
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import get_package
 
 
 class Language(object):
@@ -137,9 +141,7 @@
         return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
 
     @classmethod
-    def default_vocab(cls, package=None, get_lex_attr=None):
-        if package is None:
-            package = get_package()
+    def default_vocab(cls, package, get_lex_attr=None):
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
         return Vocab.load(package, get_lex_attr=get_lex_attr)
@@ -157,8 +159,8 @@
         return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
 
     def __init__(self,
+                 via=None,
                  data_dir=None,
-                 model=None,
                  vocab=None,
                  tokenizer=None,
                  tagger=None,
@@ -170,19 +172,34 @@
         """
        a model can be specified:
 
-        1) by a path to the model directory (DEPRECATED)
-            - Language(data_dir='path/to/data')
+        1) by calling a Language subclass
+            - spacy.en.English()
 
-        2) by a language identifier (and optionally a package root dir)
-            - Language(lang='en')
-            - Language(lang='en', data_dir='spacy/data')
+        2) by calling a Language subclass with via (previously: data_dir)
+            - spacy.en.English('my/model/root')
+            - spacy.en.English(via='my/model/root')
 
-        3) by a model name/version (and optionally a package root dir)
-            - Language(model='en_default')
-            - Language(model='en_default ==1.0.0')
-            - Language(model='en_default <1.1.0, data_dir='spacy/data')
+        3) by package name
+            - spacy.load('en_default')
+            - spacy.load('en_default==1.0.0')
+
+        4) by package name with a relocated package base
+            - spacy.load('en_default', via='/my/package/root')
+            - spacy.load('en_default==1.0.0', via='/my/package/root')
+
+        5) by package object
+            - spacy.en.English(package)
         """
-        package = get_package(model, data_path=data_dir)
+
+        if data_dir is not None and via is None:
+            warn("Use of data_dir is deprecated, use via instead.", DeprecationWarning)
+            via = data_dir
+
+        if via is None:
+            package = util.get_package_by_name('en_default==1.0.4')
+        else:
+            package = util.get_package(via)
+
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 556de3659..5082da253 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -13,8 +13,8 @@ from .util import get_package
 
 class Lemmatizer(object):
     @classmethod
-    def load(cls, pkg_or_str_or_file):
-        pkg = get_package(pkg_or_str_or_file)
+    def load(cls, via):
+        pkg = get_package(via)
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 2b7364487..df71e8f98 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -170,8 +170,8 @@ cdef class Matcher:
     cdef object _patterns
 
     @classmethod
-    def load(cls, pkg_or_str_or_file, Vocab vocab):
-        package = get_package(pkg_or_str_or_file)
+    def load(cls, via, Vocab vocab):
+        package = get_package(via)
         patterns = package.load_json(('vocab', 'gazetteer.json'))
         return cls(vocab, patterns)
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index a3f8797e2..7d7b82d90 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -148,8 +148,8 @@ cdef class Tagger:
         return cls(vocab, model)
 
     @classmethod
-    def load(cls, pkg_or_str_or_file, vocab):
-        pkg = get_package(pkg_or_str_or_file)
+    def load(cls, via, vocab):
+        pkg = get_package(via)
         # TODO: templates.json deprecated? not present in latest package
         templates = cls.default_templates()
         # templates = package.load_utf8(json.load,
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 49e8a06ef..9f195f784 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -42,8 +42,8 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)
 
     @classmethod
-    def load(cls, pkg_or_str_or_file, Vocab vocab):
-        pkg = get_package(pkg_or_str_or_file)
+    def load(cls, via, Vocab vocab):
+        pkg = get_package(via)
         rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
diff --git a/spacy/util.py b/spacy/util.py
index c998df056..27d1fe161 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,34 +7,29 @@
 import os.path
 
 import sputnik
 from sputnik.dir_package import DirPackage
 from sputnik.package_stub import PackageStub
-from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)
 
 from . import about
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 
-def get_package(value=None, data_path=None):
-    if data_path is None:
-        if isinstance(value, PackageStub):
-            return value
-        elif value and os.path.isdir(value):
-            return DirPackage(value)
+def get_package(via=None):
+    if isinstance(via, PackageStub):
+        return via
+    return DirPackage(via)
 
-    elif value is None and data_path is not None:
-        return DirPackage(data_path)
 
+def get_package_by_name(name, via=None):
     try:
-        return sputnik.package('spacy', about.short_version,
-                               value or 'en_default==1.0.4',
-                               data_path=data_path)
-
+        return sputnik.package('spacy', about.short_version, name, data_path=via)
     except PackageNotFoundException as e:
         raise RuntimeError("Model not installed. Please run 'python -m "
                            "spacy.en.download' to install latest compatible "
                            "model.")
     except CompatiblePackageNotFoundException as e:
         raise RuntimeError("Installed model is not compatible with spaCy "
-                           "version. Please run 'python -m spacy.en.download' "
+                           "version. Please run 'python -m spacy.en.download "
                            "--force' to install latest compatible model.")
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index e09cb48de..f9771d5f7 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -48,8 +48,8 @@ cdef class Vocab:
     '''A map container for a language's LexemeC structs.
    '''
    @classmethod
-    def load(cls, pkg_or_str_or_file, get_lex_attr=None):
-        package = get_package(pkg_or_str_or_file)
+    def load(cls, via, get_lex_attr=None):
+        package = get_package(via)
         tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
 
         lemmatizer = Lemmatizer.load(package)
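
For reference, a minimal usage sketch of the loading API introduced by this patch (not part of the diff itself). It assumes a compatible en_default model has already been installed with 'python -m spacy.en.download'; the sample sentence is arbitrary.

# Usage sketch for the new loading API (assumes the en_default model is installed).
import spacy

# Load by package name; spacy.load() resolves the sputnik package and
# constructs an English pipeline (see spacy/__init__.py above).
nlp = spacy.load('en_default==1.0.4')

# A relocated package base can be supplied with `via`, replacing the
# deprecated data_dir argument, e.g.:
#   nlp = spacy.load('en_default==1.0.4', via='/my/package/root')

doc = nlp(u'This is a sentence.')
print([(token.orth_, token.tag_) for token in doc])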