* Fix use of mock Package object

This commit is contained in:
Matthew Honnibal 2015-12-31 04:13:15 +01:00
parent 029136a007
commit eaf2ad59f1
6 changed files with 39 additions and 31 deletions

View File

@@ -186,7 +186,7 @@ class Language(object):
if load_vectors is not True: if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning) warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True): if vocab in (None, True):
vocab = self.default_vocab(package) vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs())
self.vocab = vocab self.vocab = vocab
if tokenizer in (None, True): if tokenizer in (None, True):
tokenizer = Tokenizer.load(package, self.vocab) tokenizer = Tokenizer.load(package, self.vocab)

View File

@@ -22,8 +22,7 @@ class Lemmatizer(object):
index[pos] = read_index(file_) if file_ is not None else set() index[pos] = read_index(file_) if file_ is not None else set()
with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_: with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
exc[pos] = read_exc(file_) if file_ is not None else {} exc[pos] = read_exc(file_) if file_ is not None else {}
with pkg.open(('vocab', 'lemma_rules.json'), default=None) as file_: rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
rules = json.load(file_) if file_ is not None else {}
return cls(index, exc, rules) return cls(index, exc, rules)
def __init__(self, index, exceptions, rules): def __init__(self, index, exceptions, rules):

View File

@@ -172,9 +172,7 @@ cdef class Matcher:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, Vocab vocab): def load(cls, pkg_or_str_or_file, Vocab vocab):
package = Package.create_or_return(pkg_or_str_or_file) package = Package.create_or_return(pkg_or_str_or_file)
patterns = package.load_json(('vocab', 'gazetteer.json'))
with package.open(('vocab', 'serializer.json'), default=None) as file_:
patterns = json.load(file_) if file_ is not None else {}
return cls(vocab, patterns) return cls(vocab, patterns)
def __init__(self, vocab, patterns): def __init__(self, vocab, patterns):

View File

@@ -25,14 +25,16 @@ def lemmatizer(package):
def test_read_index(package): def test_read_index(package):
index = package.load_utf8(read_index, 'wordnet', 'index.noun') with package.open(('wordnet', 'index.noun')) as file_:
index = read_index(file_)
assert 'man' in index assert 'man' in index
assert 'plantes' not in index assert 'plantes' not in index
assert 'plant' in index assert 'plant' in index
def test_read_exc(package): def test_read_exc(package):
exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc') with package.open(('wordnet', 'verb.exc')) as file_:
exc = read_exc(file_)
assert exc['was'] == ('be',) assert exc['was'] == ('be',)

View File

@@ -9,8 +9,8 @@ import types
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def local_path(subdir): def local_path(*dirs):
return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))
class Package(object): class Package(object):
@@ -18,10 +18,10 @@ class Package(object):
def create_or_return(cls, me_or_arg): def create_or_return(cls, me_or_arg):
return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg) return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
def __init__(self, data_path=None): def __init__(self, data_path=None, model='en_default-1.0.3'):
if data_path is None: if data_path is None:
data_path = local_path('data') data_path = local_path('data', model)
self.name = None self.model = model
self.data_path = data_path self.data_path = data_path
self._root = self.data_path self._root = self.data_path
@@ -37,18 +37,22 @@ class Package(object):
def dir_path(self, *path_parts, **kwargs): def dir_path(self, *path_parts, **kwargs):
return os.path.join(self._root, *path_parts) return os.path.join(self._root, *path_parts)
def load_utf8(self, func, *path_parts, **kwargs): def load_json(self, path_parts, default=None):
if kwargs.get('require', True): if not self.has_file(*path_parts):
with io.open(self.file_path(os.path.join(*path_parts)), if _is_error_class(default):
mode='r', encoding='utf8') as f: raise default(self.file_path(*path_parts))
return func(f) elif isinstance(default, Exception):
else: raise default
return None else:
return default
with io.open(self.file_path(os.path.join(*path_parts)),
mode='r', encoding='utf8') as file_:
return json.load(file_)
@contextmanager @contextmanager
def open(self, path_parts, default=IOError): def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
if not self.has_file(*path_parts): if not self.has_file(*path_parts):
if isinstance(default, types.TypeType) and issubclass(default, Exception): if _is_error_class(default):
raise default(self.file_path(*path_parts)) raise default(self.file_path(*path_parts))
elif isinstance(default, Exception): elif isinstance(default, Exception):
raise default raise default
@@ -57,12 +61,16 @@ class Package(object):
else: else:
# Enter # Enter
file_ = io.open(self.file_path(os.path.join(*path_parts)), file_ = io.open(self.file_path(os.path.join(*path_parts)),
mode='r', encoding='utf8') mode=mode, encoding='utf8')
yield file_ yield file_
# Exit # Exit
file_.close() file_.close()
def _is_error_class(e):
return isinstance(e, types.TypeType) and issubclass(e, Exception)
def get_package(name=None, data_path=None): def get_package(name=None, data_path=None):
return Package(data_path) return Package(data_path)
@@ -92,10 +100,13 @@ def utf8open(loc, mode='r'):
def read_lang_data(package): def read_lang_data(package):
tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json') tokenization = package.load_json(('tokenizer', 'specials.json'))
prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt') with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt') prefix = read_prefix(file_) if file_ is not None else None
infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt') with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
suffix = read_suffix(file_) if file_ is not None else None
with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
infix = read_infix(file_) if file_ is not None else None
return tokenization, prefix, suffix, infix return tokenization, prefix, suffix, infix

View File

@@ -50,13 +50,11 @@ cdef class Vocab:
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None): def load(cls, pkg_or_str_or_file, get_lex_attr=None):
package = Package.create_or_return(pkg_or_str_or_file) package = Package.create_or_return(pkg_or_str_or_file)
with package.open(('vocab', 'tag_map.json'), default=None) as file_: tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
tag_map = json.load(file_) if file_ is not None else {}
lemmatizer = Lemmatizer.load(package) lemmatizer = Lemmatizer.load(package)
with package.open(('vocab', 'serializer.json'), default=None) as file_: serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
serializer_freqs = json.load(file_) if file_ is not None else {}
cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)