refactored data_dir->via, add zip_safe, add spacy.load()

This commit is contained in:
Henning Peters 2016-01-15 18:01:02 +01:00
parent 04e67e8715
commit 788f734513
10 changed files with 73 additions and 46 deletions

View File

@ -260,6 +260,7 @@ def setup_package():
setup( setup(
name='spacy', name='spacy',
zip_safe=False,
packages=PACKAGES, packages=PACKAGES,
package_data={'': ['*.pyx', '*.pxd']}, package_data={'': ['*.pyx', '*.pxd']},
description='Industrial-strength NLP', description='Industrial-strength NLP',

View File

@ -0,0 +1,7 @@
from . import util
from .en import English
def load(name, via=None):
    """Load the model package called `name` and wrap it in an English pipeline.

    name: package name, optionally with a version spec
        (e.g. 'en_default' or 'en_default==1.0.0').
    via: optional package root directory, forwarded to the package lookup.
    """
    return English(util.get_package_by_name(name, via=via))

View File

@ -1,3 +1,5 @@
from __future__ import print_function
import sys import sys
import os import os
import shutil import shutil
@ -37,21 +39,26 @@ def link(package, path):
force=("Force overwrite", "flag", "f", bool), force=("Force overwrite", "flag", "f", bool),
) )
def main(data_size='all', force=False):
    """Install the default English model and verify that it can be found.

    data_size: accepted for command-line compatibility; not used in this body.
    force: when true, call sputnik.purge before installing.

    Exits the process with status 1 if the package lookup fails after the
    install step; otherwise migrates old-style data and reports success.
    """
    # Imported locally so this function does not depend on module-level
    # imports that are not visible in this part of the file.
    from sputnik.package_list import (PackageNotFoundException,
                                      CompatiblePackageNotFoundException)

    package_name = 'en_default==1.0.4'
    path = os.path.dirname(os.path.abspath(__file__))

    if force:
        sputnik.purge('spacy', about.short_version)

    sputnik.install('spacy', about.short_version, package_name)

    try:
        sputnik.package('spacy', about.short_version, package_name)
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        # The tuple is required: "except A, B:" catches only A and binds the
        # exception instance to the name B (and is a syntax error on Python 3).
        print("Model failed to install. Please run 'python -m "
              "spacy.en.download --force'.", file=sys.stderr)
        sys.exit(1)

    # FIXME clean up old-style packages
    migrate(path)

    print("Model successfully installed.", file=sys.stderr)
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) plac.call(main)

View File

@ -8,6 +8,9 @@ try:
except ImportError: except ImportError:
import json import json
import sputnik
from sputnik.dir_package import DirPackage
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .vocab import Vocab from .vocab import Vocab
from .syntax.parser import Parser from .syntax.parser import Parser
@ -19,8 +22,9 @@ from . import orth
from .syntax.ner import BiluoPushDown from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager from .syntax.arc_eager import ArcEager
from . import about
from . import util
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package
class Language(object): class Language(object):
@ -137,9 +141,7 @@ class Language(object):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_vocab(cls, package, get_lex_attr=None):
    """Load the vocabulary from `package`.

    When `get_lex_attr` is None, the class's default lexical attribute
    getters (cls.default_lex_attrs()) are used instead.
    """
    lex_attr_getters = (cls.default_lex_attrs() if get_lex_attr is None
                        else get_lex_attr)
    return Vocab.load(package, get_lex_attr=lex_attr_getters)
@ -157,8 +159,8 @@ class Language(object):
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
def __init__(self, def __init__(self,
via=None,
data_dir=None, data_dir=None,
model=None,
vocab=None, vocab=None,
tokenizer=None, tokenizer=None,
tagger=None, tagger=None,
@ -170,19 +172,34 @@ class Language(object):
""" """
a model can be specified: a model can be specified:
1) by a path to the model directory (DEPRECATED) 1) by calling a Language subclass
- Language(data_dir='path/to/data') - spacy.en.English()
2) by a language identifier (and optionally a package root dir) 2) by calling a Language subclass with via (previously: data_dir)
- Language(lang='en') - spacy.en.English('my/model/root')
- Language(lang='en', data_dir='spacy/data') - spacy.en.English(via='my/model/root')
3) by a model name/version (and optionally a package root dir) 3) by package name
- Language(model='en_default') - spacy.load('en_default')
- Language(model='en_default ==1.0.0') - spacy.load('en_default==1.0.0')
- Language(model='en_default <1.1.0, data_dir='spacy/data')
4) by package name with a relocated package base
- spacy.load('en_default', via='/my/package/root')
- spacy.load('en_default==1.0.0', via='/my/package/root')
5) by package object
- spacy.en.English(package)
""" """
package = get_package(model, data_path=data_dir)
if data_dir is not None and via is None:
warn("Use of data_dir is deprecated, use via instead.", DeprecationWarning)
via = data_dir
if via is None:
package = util.get_package_by_name('en_default==1.0.4')
else:
package = util.get_package(via)
if load_vectors is not True: if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning) warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True): if vocab in (None, True):

View File

@ -13,8 +13,8 @@ from .util import get_package
class Lemmatizer(object): class Lemmatizer(object):
@classmethod @classmethod
def load(cls, pkg_or_str_or_file): def load(cls, via):
pkg = get_package(pkg_or_str_or_file) pkg = get_package(via)
index = {} index = {}
exc = {} exc = {}
for pos in ['adj', 'noun', 'verb']: for pos in ['adj', 'noun', 'verb']:

View File

@ -170,8 +170,8 @@ cdef class Matcher:
cdef object _patterns cdef object _patterns
@classmethod
def load(cls, via, Vocab vocab):
    """Create a Matcher from the gazetteer patterns stored in a package.

    via: a package object or package directory, resolved by get_package().
    vocab: the Vocab handed through to the Matcher constructor.
    """
    gazetteer = get_package(via).load_json(('vocab', 'gazetteer.json'))
    return cls(vocab, gazetteer)

View File

@ -148,8 +148,8 @@ cdef class Tagger:
return cls(vocab, model) return cls(vocab, model)
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, vocab): def load(cls, via, vocab):
pkg = get_package(pkg_or_str_or_file) pkg = get_package(via)
# TODO: templates.json deprecated? not present in latest package # TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates() templates = cls.default_templates()
# templates = package.load_utf8(json.load, # templates = package.load_utf8(json.load,

View File

@ -42,8 +42,8 @@ cdef class Tokenizer:
return (self.__class__, args, None, None) return (self.__class__, args, None, None)
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, via, Vocab vocab):
pkg = get_package(pkg_or_str_or_file) pkg = get_package(via)
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
prefix_re = re.compile(prefix_re) prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re) suffix_re = re.compile(suffix_re)

View File

@ -7,34 +7,29 @@ import os.path
import sputnik import sputnik
from sputnik.dir_package import DirPackage from sputnik.dir_package import DirPackage
from sputnik.package_stub import PackageStub from sputnik.package_stub import PackageStub
from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from . import about from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def get_package(via=None):
    """Return a package object for `via`.

    A PackageStub instance is returned unchanged; any other value is
    wrapped in a DirPackage.
    """
    return via if isinstance(via, PackageStub) else DirPackage(via)
def get_package_by_name(name, via=None):
    """Look up an installed spaCy model package by name.

    name: package name, optionally with a version spec.
    via: optional package root directory, passed to sputnik as data_path.

    Raises RuntimeError with installation instructions when sputnik reports
    that the package is missing or not compatible with this spaCy version.
    """
    try:
        package = sputnik.package('spacy', about.short_version, name,
                                  data_path=via)
    except PackageNotFoundException:
        raise RuntimeError("Model not installed. Please run 'python -m "
                           "spacy.en.download' to install latest compatible "
                           "model.")
    except CompatiblePackageNotFoundException:
        raise RuntimeError("Installed model is not compatible with spaCy "
                           "version. Please run 'python -m spacy.en.download "
                           "--force' to install latest compatible model.")
    return package

View File

@ -48,8 +48,8 @@ cdef class Vocab:
'''A map container for a language's LexemeC structs. '''A map container for a language's LexemeC structs.
''' '''
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None): def load(cls, via, get_lex_attr=None):
package = get_package(pkg_or_str_or_file) package = get_package(via)
tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.load(package) lemmatizer = Lemmatizer.load(package)