* Fix use of mock Package object

This commit is contained in:
Matthew Honnibal 2015-12-31 04:13:15 +01:00
parent 029136a007
commit eaf2ad59f1
6 changed files with 39 additions and 31 deletions

View File

@ -186,7 +186,7 @@ class Language(object):
if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True):
vocab = self.default_vocab(package)
vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs())
self.vocab = vocab
if tokenizer in (None, True):
tokenizer = Tokenizer.load(package, self.vocab)

View File

@ -22,8 +22,7 @@ class Lemmatizer(object):
index[pos] = read_index(file_) if file_ is not None else set()
with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
exc[pos] = read_exc(file_) if file_ is not None else {}
with pkg.open(('vocab', 'lemma_rules.json'), default=None) as file_:
rules = json.load(file_) if file_ is not None else {}
rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
return cls(index, exc, rules)
def __init__(self, index, exceptions, rules):

View File

@ -172,9 +172,7 @@ cdef class Matcher:
@classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab):
package = Package.create_or_return(pkg_or_str_or_file)
with package.open(('vocab', 'serializer.json'), default=None) as file_:
patterns = json.load(file_) if file_ is not None else {}
patterns = package.load_json(('vocab', 'gazetteer.json'))
return cls(vocab, patterns)
def __init__(self, vocab, patterns):

View File

@ -25,14 +25,16 @@ def lemmatizer(package):
def test_read_index(package):
index = package.load_utf8(read_index, 'wordnet', 'index.noun')
with package.open(('wordnet', 'index.noun')) as file_:
index = read_index(file_)
assert 'man' in index
assert 'plantes' not in index
assert 'plant' in index
def test_read_exc(package):
exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
with package.open(('wordnet', 'verb.exc')) as file_:
exc = read_exc(file_)
assert exc['was'] == ('be',)

View File

@ -9,8 +9,8 @@ import types
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def local_path(subdir):
    # NOTE: `subdir` is ignored — the result always resolves to the fixed
    # 'data' directory sitting next to this module.
    return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
def local_path(*dirs):
return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))
class Package(object):
@ -18,10 +18,10 @@ class Package(object):
def create_or_return(cls, me_or_arg):
    """Return *me_or_arg* unchanged when it is already an instance of *cls*;
    otherwise treat it as a constructor argument and wrap it: ``cls(me_or_arg)``.
    """
    return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
def __init__(self, data_path=None):
def __init__(self, data_path=None, model='en_default-1.0.3'):
if data_path is None:
data_path = local_path('data')
self.name = None
data_path = local_path('data', model)
self.model = model
self.data_path = data_path
self._root = self.data_path
@ -37,18 +37,22 @@ class Package(object):
def dir_path(self, *path_parts, **kwargs):
    """Join *path_parts* onto the package root directory.

    ``**kwargs`` is accepted for signature compatibility but is not consulted.
    """
    return os.path.join(self._root, *path_parts)
def load_utf8(self, func, *path_parts, **kwargs):
    """Open the package file at *path_parts* as UTF-8 text and return
    ``func(file)``.

    Only the ``require`` keyword is consulted (default True); when it is
    falsy the file is never opened and None is returned — note that in that
    case the file's existence is not checked at all.
    """
    if kwargs.get('require', True):
        with io.open(self.file_path(os.path.join(*path_parts)),
                mode='r', encoding='utf8') as f:
            return func(f)
    else:
        return None
def load_json(self, path_parts, default=None):
    """Parse and return the JSON file at *path_parts* inside the package.

    When the file is missing, *default* controls the outcome:
      * an Exception subclass  -> raise ``default(file_path)``;
      * an Exception instance  -> raise it as-is;
      * anything else          -> return *default* unchanged.
    """
    if not self.has_file(*path_parts):
        if _is_error_class(default):
            raise default(self.file_path(*path_parts))
        elif isinstance(default, Exception):
            raise default
        else:
            return default
    with io.open(self.file_path(os.path.join(*path_parts)),
            mode='r', encoding='utf8') as file_:
        return json.load(file_)
@contextmanager
def open(self, path_parts, default=IOError):
def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
if not self.has_file(*path_parts):
if isinstance(default, types.TypeType) and issubclass(default, Exception):
if _is_error_class(default):
raise default(self.file_path(*path_parts))
elif isinstance(default, Exception):
raise default
@ -57,12 +61,16 @@ class Package(object):
else:
# Enter
file_ = io.open(self.file_path(os.path.join(*path_parts)),
mode='r', encoding='utf8')
mode=mode, encoding='utf8')
yield file_
# Exit
file_.close()
def _is_error_class(e):
return isinstance(e, types.TypeType) and issubclass(e, Exception)
def get_package(name=None, data_path=None):
    """Return a Package rooted at *data_path*.

    *name* is part of the public signature but is not consulted here.
    """
    return Package(data_path=data_path)
@ -92,10 +100,13 @@ def utf8open(loc, mode='r'):
def read_lang_data(package):
tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
tokenization = package.load_json(('tokenizer', 'specials.json'))
with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
prefix = read_prefix(file_) if file_ is not None else None
with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
suffix = read_suffix(file_) if file_ is not None else None
with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
infix = read_infix(file_) if file_ is not None else None
return tokenization, prefix, suffix, infix

View File

@ -50,13 +50,11 @@ cdef class Vocab:
@classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None):
package = Package.create_or_return(pkg_or_str_or_file)
with package.open(('vocab', 'tag_map.json'), default=None) as file_:
tag_map = json.load(file_) if file_ is not None else {}
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.load(package)
with package.open(('vocab', 'serializer.json'), default=None) as file_:
serializer_freqs = json.load(file_) if file_ is not None else {}
serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)