Merge pull request #222 from henningpeters/revise_packaging

integrate with sputnik
This commit is contained in:
Matthew Honnibal 2016-01-16 01:23:39 +11:00
commit ed001ea977
12 changed files with 59 additions and 103 deletions

View File

@ -1,17 +1,14 @@
{ {
"name": "en_default", "name": "en_test",
"version": "0.100.0", "version": "1.0.0",
"description": "english default model", "description": "english test model",
"license": "public domain", "license": "public domain",
"include": [ "include": [
"deps/*", ["deps", "*"],
"ner/*", ["ner", "*"],
"pos/*", ["pos", "*"],
"tokenizer/*", ["tokenizer", "*"],
"vocab/*", ["vocab", "*"],
"wordnet/*" ["wordnet", "*"]
], ]
"compatibility": {
"spacy": "==0.100.0"
}
} }

View File

@ -10,4 +10,4 @@ plac
six six
ujson ujson
cloudpickle cloudpickle
sputnik>=0.6.4,<0.7.0 sputnik>=0.7.0,<0.8.0

View File

@ -271,7 +271,7 @@ def setup_package():
ext_modules=ext_modules, ext_modules=ext_modules,
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47', install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six', 'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'], 'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'],
cmdclass = { cmdclass = {
'build_ext': build_ext_subclass}, 'build_ext': build_ext_subclass},
) )

View File

@ -3,7 +3,9 @@ import os
import shutil import shutil
import plac import plac
from sputnik import Sputnik import sputnik
from .. import about
def migrate(path): def migrate(path):
@ -35,23 +37,17 @@ def link(package, path):
force=("Force overwrite", "flag", "f", bool), force=("Force overwrite", "flag", "f", bool),
) )
def main(data_size='all', force=False): def main(data_size='all', force=False):
# TODO read version from the same source as the setup
sputnik = Sputnik('spacy', '0.100.0', console=sys.stdout)
path = os.path.dirname(os.path.abspath(__file__)) path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.abspath(os.path.join(path, '..', 'data')) data_path = os.path.abspath(os.path.join(path, '..', 'data'))
if not os.path.isdir(data_path): if not os.path.isdir(data_path):
os.mkdir(data_path) os.mkdir(data_path)
command = sputnik.command(
data_path=data_path,
repository_url='https://index.spacy.io')
if force: if force:
command.purge() sputnik.purge('spacy', about.short_version, data_path=data_path)
package = command.install('en_default') package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4',
data_path=data_path)
# FIXME clean up old-style packages # FIXME clean up old-style packages
migrate(path) migrate(path)

View File

@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package, Package from .util import get_package
class Language(object): class Language(object):
@ -146,13 +146,13 @@ class Language(object):
@classmethod @classmethod
def default_parser(cls, package, vocab): def default_parser(cls, package, vocab):
data_dir = package.dir_path('deps', require=False) data_dir = package.dir_path('deps')
if data_dir and path.exists(data_dir): if data_dir and path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, ArcEager) return Parser.from_dir(data_dir, vocab.strings, ArcEager)
@classmethod @classmethod
def default_entity(cls, package, vocab): def default_entity(cls, package, vocab):
data_dir = package.dir_path('ner', require=False) data_dir = package.dir_path('ner')
if data_dir and path.exists(data_dir): if data_dir and path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
@ -182,7 +182,7 @@ class Language(object):
- Language(model='en_default ==1.0.0') - Language(model='en_default ==1.0.0')
- Language(model='en_default <1.1.0', data_dir='spacy/data') - Language(model='en_default <1.1.0', data_dir='spacy/data')
""" """
package = Package(data_dir) package = get_package(model, data_path=data_dir)
if load_vectors is not True: if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning) warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True): if vocab in (None, True):

View File

@ -8,13 +8,13 @@ except ImportError:
import json import json
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
from .util import Package from .util import get_package
class Lemmatizer(object): class Lemmatizer(object):
@classmethod @classmethod
def load(cls, pkg_or_str_or_file): def load(cls, pkg_or_str_or_file):
pkg = Package.create_or_return(pkg_or_str_or_file) pkg = get_package(pkg_or_str_or_file)
index = {} index = {}
exc = {} exc = {}
for pos in ['adj', 'noun', 'verb']: for pos in ['adj', 'noun', 'verb']:

View File

@ -21,7 +21,7 @@ from .tokens.doc cimport Doc
from .vocab cimport Vocab from .vocab cimport Vocab
from .attrs import FLAG61 as U_ENT from .attrs import FLAG61 as U_ENT
from .util import Package from .util import get_package
from .attrs import FLAG60 as B2_ENT from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT from .attrs import FLAG59 as B3_ENT
@ -171,7 +171,7 @@ cdef class Matcher:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, pkg_or_str_or_file, Vocab vocab):
package = Package.create_or_return(pkg_or_str_or_file) package = get_package(pkg_or_str_or_file)
patterns = package.load_json(('vocab', 'gazetteer.json')) patterns = package.load_json(('vocab', 'gazetteer.json'))
return cls(vocab, patterns) return cls(vocab, patterns)

View File

@ -16,7 +16,7 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .attrs cimport * from .attrs cimport *
from .util import Package from .util import get_package
cpdef enum: cpdef enum:
@ -149,7 +149,7 @@ cdef class Tagger:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, vocab): def load(cls, pkg_or_str_or_file, vocab):
pkg = Package.create_or_return(pkg_or_str_or_file) pkg = get_package(pkg_or_str_or_file)
# TODO: templates.json deprecated? not present in latest package # TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates() templates = cls.default_templates()
# templates = package.load_utf8(json.load, # templates = package.load_utf8(json.load,

View File

@ -5,7 +5,7 @@ import io
import pickle import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import get_package, Package from spacy.util import get_package
import pytest import pytest

View File

@ -17,7 +17,7 @@ cimport cython
from . import util from . import util
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .util import read_lang_data from .util import read_lang_data
from .util import Package from .util import get_package
cdef class Tokenizer: cdef class Tokenizer:
@ -43,7 +43,7 @@ cdef class Tokenizer:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, pkg_or_str_or_file, Vocab vocab):
pkg = Package.create_or_return(pkg_or_str_or_file) pkg = get_package(pkg_or_str_or_file)
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
prefix_re = re.compile(prefix_re) prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re) suffix_re = re.compile(suffix_re)

View File

@ -3,76 +3,39 @@ import io
import json import json
import re import re
import os.path import os.path
from contextlib import contextmanager
import types
import sputnik
from sputnik.dir_package import DirPackage
from sputnik.package_stub import PackageStub
from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException
from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def local_path(*dirs): def get_package(value=None, data_path=None):
return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))
class Package(object):
@classmethod
def create_or_return(cls, me_or_arg):
return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
def __init__(self, data_path=None, model='en_default-1.0.3'):
if data_path is None: if data_path is None:
data_path = local_path('data', model) if isinstance(value, PackageStub):
self.model = model return value
self.data_path = data_path elif value and os.path.isdir(value):
self._root = self.data_path return DirPackage(value)
def get(self, key): elif value is None and data_path is not None:
pass return DirPackage(data_path)
def has_file(self, *path_parts): try:
return os.path.exists(os.path.join(self._root, *path_parts)) return sputnik.package('spacy', about.short_version,
value or 'en_default==1.0.4',
data_path=data_path)
def file_path(self, *path_parts, **kwargs): except PackageNotFoundException as e:
return os.path.join(self._root, *path_parts) raise RuntimeError("Model not installed. Please run 'python -m "
"spacy.en.download' to install latest compatible "
def dir_path(self, *path_parts, **kwargs): "model.")
return os.path.join(self._root, *path_parts) except CompatiblePackageNotFoundException as e:
raise RuntimeError("Installed model is not compatible with spaCy "
def load_json(self, path_parts, default=None): "version. Please run 'python -m spacy.en.download "
if not self.has_file(*path_parts): "--force' to install latest compatible model.")
if _is_error_class(default):
raise default(self.file_path(*path_parts))
elif isinstance(default, Exception):
raise default
else:
return default
with io.open(self.file_path(os.path.join(*path_parts)),
mode='r', encoding='utf8') as file_:
return json.load(file_)
@contextmanager
def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
if not self.has_file(*path_parts):
if _is_error_class(default):
raise default(self.file_path(*path_parts))
elif isinstance(default, Exception):
raise default
else:
yield default
else:
# Enter
file_ = io.open(self.file_path(os.path.join(*path_parts)),
mode=mode, encoding='utf8')
yield file_
# Exit
file_.close()
def _is_error_class(e):
return isinstance(e, types.TypeType) and issubclass(e, Exception)
def get_package(name=None, data_path=None):
return Package(data_path)
def normalize_slice(length, start, stop, step=None): def normalize_slice(length, start, stop, step=None):

View File

@ -19,7 +19,7 @@ from .orth cimport word_shape
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile from .cfile cimport CFile
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .util import Package from .util import get_package
from . import attrs from . import attrs
from . import symbols from . import symbols
@ -49,7 +49,7 @@ cdef class Vocab:
''' '''
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None): def load(cls, pkg_or_str_or_file, get_lex_attr=None):
package = Package.create_or_return(pkg_or_str_or_file) package = get_package(pkg_or_str_or_file)
tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.load(package) lemmatizer = Lemmatizer.load(package)