From eaf2ad59f1943bea698493673500034fbc830ff8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 31 Dec 2015 04:13:15 +0100
Subject: [PATCH] * Fix use of mock Package object

---
 spacy/language.py                     |  2 +-
 spacy/lemmatizer.py                   |  3 +-
 spacy/matcher.pyx                     |  4 +--
 spacy/tests/tagger/test_lemmatizer.py |  6 ++--
 spacy/util.py                         | 49 ++++++++++++++++-----------
 spacy/vocab.pyx                       |  6 ++--
 6 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 0123e1c4f..1dbbc09b1 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -186,7 +186,7 @@ class Language(object):
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
-            vocab = self.default_vocab(package)
+            vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs())
         self.vocab = vocab
         if tokenizer in (None, True):
             tokenizer = Tokenizer.load(package, self.vocab)
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index dfa8b3aa3..48f23b4b4 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -22,8 +22,7 @@ class Lemmatizer(object):
                 index[pos] = read_index(file_) if file_ is not None else set()
             with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
                 exc[pos] = read_exc(file_) if file_ is not None else {}
-        with pkg.open(('vocab', 'lemma_rules.json'), default=None) as file_:
-            rules = json.load(file_) if file_ is not None else {}
+        rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index ba4b46fad..777cdfbf3 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -172,9 +172,7 @@ cdef class Matcher:
     @classmethod
     def load(cls, pkg_or_str_or_file, Vocab vocab):
         package = Package.create_or_return(pkg_or_str_or_file)
-
-        with package.open(('vocab', 'serializer.json'), default=None) as file_:
-            patterns = json.load(file_) if file_ is not None else {}
+        patterns = package.load_json(('vocab', 'gazetteer.json'))
         return cls(vocab, patterns)
 
     def __init__(self, vocab, patterns):
diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index 8ba2cc3ee..a73c6dd4b 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -25,14 +25,16 @@ def lemmatizer(package):
 
 
 def test_read_index(package):
-    index = package.load_utf8(read_index, 'wordnet', 'index.noun')
+    with package.open(('wordnet', 'index.noun')) as file_:
+        index = read_index(file_)
     assert 'man' in index
     assert 'plantes' not in index
     assert 'plant' in index
 
 
 def test_read_exc(package):
-    exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
+    with package.open(('wordnet', 'verb.exc')) as file_:
+        exc = read_exc(file_)
     assert exc['was'] == ('be',)
 
 
diff --git a/spacy/util.py b/spacy/util.py
index 61f708b8f..5f148bc01 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -9,8 +9,8 @@ import types
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 
-def local_path(subdir):
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
+def local_path(*dirs):
+    return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))
 
 
 class Package(object):
@@ -18,10 +18,10 @@ class Package(object):
     def create_or_return(cls, me_or_arg):
         return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
 
-    def __init__(self, data_path=None):
+    def __init__(self, data_path=None, model='en_default-1.0.3'):
         if data_path is None:
-            data_path = local_path('data')
-        self.name = None
+            data_path = local_path('data', model)
+        self.model = model
         self.data_path = data_path
         self._root = self.data_path
 
@@ -37,18 +37,22 @@ class Package(object):
     def dir_path(self, *path_parts, **kwargs):
         return os.path.join(self._root, *path_parts)
 
-    def load_utf8(self, func, *path_parts, **kwargs):
-        if kwargs.get('require', True):
-            with io.open(self.file_path(os.path.join(*path_parts)),
-                         mode='r', encoding='utf8') as f:
-                return func(f)
-        else:
-            return None
+    def load_json(self, path_parts, default=None):
+        if not self.has_file(*path_parts):
+            if _is_error_class(default):
+                raise default(self.file_path(*path_parts))
+            elif isinstance(default, Exception):
+                raise default
+            else:
+                return default
+        with io.open(self.file_path(os.path.join(*path_parts)),
+                     mode='r', encoding='utf8') as file_:
+            return json.load(file_)
 
     @contextmanager
-    def open(self, path_parts, default=IOError):
+    def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
         if not self.has_file(*path_parts):
-            if isinstance(default, types.TypeType) and issubclass(default, Exception):
+            if _is_error_class(default):
                 raise default(self.file_path(*path_parts))
             elif isinstance(default, Exception):
                 raise default
@@ -57,12 +61,16 @@ class Package(object):
         else:
             # Enter
            file_ = io.open(self.file_path(os.path.join(*path_parts)),
-                            mode='r', encoding='utf8')
+                            mode=mode, encoding=encoding)
             yield file_
             # Exit
             file_.close()
 
 
+def _is_error_class(e):
+    return isinstance(e, types.TypeType) and issubclass(e, Exception)
+
+
 def get_package(name=None, data_path=None):
     return Package(data_path)
 
@@ -92,10 +100,13 @@ def utf8open(loc, mode='r'):
 
 
 def read_lang_data(package):
-    tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
-    prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
-    suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
-    infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
+    tokenization = package.load_json(('tokenizer', 'specials.json'))
+    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
+        prefix = read_prefix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
+        suffix = read_suffix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
+        infix = read_infix(file_) if file_ is not None else None
     return tokenization, prefix, suffix, infix
 
 
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 1444f767e..a1d5ee8cc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -50,13 +50,11 @@ cdef class Vocab:
     @classmethod
     def load(cls, pkg_or_str_or_file, get_lex_attr=None):
         package = Package.create_or_return(pkg_or_str_or_file)
-        with package.open(('vocab', 'tag_map.json'), default=None) as file_:
-            tag_map = json.load(file_) if file_ is not None else {}
+        tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
 
         lemmatizer = Lemmatizer.load(package)
 
-        with package.open(('vocab', 'serializer.json'), default=None) as file_:
-            serializer_freqs = json.load(file_) if file_ is not None else {}
+        serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
 
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
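
A minimal usage sketch of the reworked Package helpers, under the default
data layout introduced above. The specific files opened here are
illustrative; whether they exist depends on the installed model data.

    from spacy.util import Package
    from spacy.lemmatizer import read_index

    pkg = Package()  # data_path defaults to spacy/data/en_default-1.0.3

    # load_json() parses the file when present; when it is missing it
    # returns `default`, or raises it if `default` is an Exception class
    # or instance.
    tag_map = pkg.load_json(('vocab', 'tag_map.json'), default={})

    # open() is a context manager. With default=None, a missing file
    # yields None instead of raising IOError, so callers can fall back.
    with pkg.open(('wordnet', 'index.noun'), default=None) as file_:
        index = read_index(file_) if file_ is not None else set()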