mirror of https://github.com/explosion/spaCy.git

Fix use of mock Package object

commit eaf2ad59f1, parent 029136a007
@@ -186,7 +186,7 @@ class Language(object):
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
-            vocab = self.default_vocab(package)
+            vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs())
         self.vocab = vocab
         if tokenizer in (None, True):
             tokenizer = Tokenizer.load(package, self.vocab)
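The hunk above swaps a private default-vocab helper for the public Vocab.load classmethod. For readers unfamiliar with the surrounding code, a minimal sketch (not spaCy's actual implementation) of the None/True defaulting idiom visible here:

# Passing None or True means "load the default component from the package";
# any other value is used as given. `load_default` is a hypothetical
# zero-argument loader callback, e.g. lambda: Vocab.load(package).
def resolve_component(value, load_default):
    return load_default() if value in (None, True) else value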
@@ -22,8 +22,7 @@ class Lemmatizer(object):
                 index[pos] = read_index(file_) if file_ is not None else set()
             with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
                 exc[pos] = read_exc(file_) if file_ is not None else {}
-        with pkg.open(('vocab', 'lemma_rules.json'), default=None) as file_:
-            rules = json.load(file_) if file_ is not None else {}
+        rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
         return cls(index, exc, rules)

     def __init__(self, index, exceptions, rules):
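The same substitution recurs throughout this commit, so it is worth stating the pattern once. Before, each caller opened the file and handled the missing-file case by hand; after, Package.load_json folds both steps into one call. A before/after sketch using the lines from this hunk:

# Before: open the file, tolerate its absence, parse it manually.
with pkg.open(('vocab', 'lemma_rules.json'), default=None) as file_:
    rules = json.load(file_) if file_ is not None else {}

# After: one call; a missing file yields the supplied default instead.
rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})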
@@ -172,9 +172,7 @@ cdef class Matcher:
     @classmethod
     def load(cls, pkg_or_str_or_file, Vocab vocab):
         package = Package.create_or_return(pkg_or_str_or_file)
-        with package.open(('vocab', 'serializer.json'), default=None) as file_:
-            patterns = json.load(file_) if file_ is not None else {}
+        patterns = package.load_json(('vocab', 'gazetteer.json'))
         return cls(vocab, patterns)

     def __init__(self, vocab, patterns):
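Matcher.load, like Vocab.load below, accepts a Package, a path string, or a file object and normalizes it through Package.create_or_return. A self-contained sketch of that idiom, with the class stripped down to the two methods that matter (the real __init__ also gains a model keyword, per the hunks below):

class Package(object):
    @classmethod
    def create_or_return(cls, me_or_arg):
        # An existing Package passes through untouched; anything else
        # (e.g. a data-path string) is wrapped in a new Package.
        return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)

    def __init__(self, data_path=None):
        self.data_path = data_path


pkg = Package('/tmp/model')
assert Package.create_or_return(pkg) is pkg
assert Package.create_or_return('/tmp/model').data_path == '/tmp/model'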
@@ -25,14 +25,16 @@ def lemmatizer(package):


 def test_read_index(package):
-    index = package.load_utf8(read_index, 'wordnet', 'index.noun')
+    with package.open(('wordnet', 'index.noun')) as file_:
+        index = read_index(file_)
     assert 'man' in index
     assert 'plantes' not in index
     assert 'plant' in index


 def test_read_exc(package):
-    exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
+    with package.open(('wordnet', 'verb.exc')) as file_:
+        exc = read_exc(file_)
     assert exc['was'] == ('be',)

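The test changes isolate the calling-convention shift: load_utf8 disappears from the Package surface, so the tests open files through the context manager and apply the reader themselves. The same two lines, before and after:

# Before: the reader callback was pushed into the Package API.
index = package.load_utf8(read_index, 'wordnet', 'index.noun')

# After: the test opens the file and applies the reader itself.
with package.open(('wordnet', 'index.noun')) as file_:
    index = read_index(file_)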
@@ -9,8 +9,8 @@ import types
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE


-def local_path(subdir):
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
+def local_path(*dirs):
+    return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))


 class Package(object):
@@ -18,10 +18,10 @@ class Package(object):
     def create_or_return(cls, me_or_arg):
         return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)

-    def __init__(self, data_path=None):
+    def __init__(self, data_path=None, model='en_default-1.0.3'):
         if data_path is None:
-            data_path = local_path('data')
-        self.name = None
+            data_path = local_path('data', model)
+        self.model = model
         self.data_path = data_path
         self._root = self.data_path

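With local_path generalized to *dirs and __init__ gaining a model keyword, a default-constructed Package resolves its data directory to a model-specific folder. A sketch of the resolution, assuming this snippet stands in for spacy/util.py:

import os

def local_path(*dirs):
    # Resolve relative to the defining module, as in the hunk above.
    return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs))

# Default Package data path before: .../spacy/data
# Default Package data path after:  .../spacy/data/en_default-1.0.3
print(local_path('data', 'en_default-1.0.3'))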
@@ -37,18 +37,22 @@ class Package(object):
     def dir_path(self, *path_parts, **kwargs):
         return os.path.join(self._root, *path_parts)

-    def load_utf8(self, func, *path_parts, **kwargs):
-        if kwargs.get('require', True):
-            with io.open(self.file_path(os.path.join(*path_parts)),
-                         mode='r', encoding='utf8') as f:
-                return func(f)
-        else:
-            return None
+    def load_json(self, path_parts, default=None):
+        if not self.has_file(*path_parts):
+            if _is_error_class(default):
+                raise default(self.file_path(*path_parts))
+            elif isinstance(default, Exception):
+                raise default
+            else:
+                return default
+        with io.open(self.file_path(os.path.join(*path_parts)),
+                     mode='r', encoding='utf8') as file_:
+            return json.load(file_)

     @contextmanager
-    def open(self, path_parts, default=IOError):
+    def open(self, path_parts, mode='r', encoding='utf8', default=IOError):
         if not self.has_file(*path_parts):
-            if isinstance(default, types.TypeType) and issubclass(default, Exception):
+            if _is_error_class(default):
                 raise default(self.file_path(*path_parts))
             elif isinstance(default, Exception):
                 raise default
@@ -57,12 +61,16 @@ class Package(object):
         else:
             # Enter
             file_ = io.open(self.file_path(os.path.join(*path_parts)),
-                            mode='r', encoding='utf8')
+                            mode=mode, encoding='utf8')
             yield file_
             # Exit
             file_.close()


+def _is_error_class(e):
+    return isinstance(e, types.TypeType) and issubclass(e, Exception)
+
+
 def get_package(name=None, data_path=None):
     return Package(data_path)

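Taken together, the new load_json and the widened open share one convention for missing files: an Exception subclass passed as default is raised with the offending path, an Exception instance is re-raised as-is, and any other value is handed back to the caller. A runnable sketch of that convention under the assumption of a plain directory layout; DATA_DIR, _missing, and open_ are illustrative names, not spaCy's:

import io
import json
import os
from contextlib import contextmanager

DATA_DIR = '/tmp/model'  # hypothetical stand-in for Package._root


def _is_error_class(e):
    # Python 3 spelling of the diff's isinstance(e, types.TypeType) check.
    return isinstance(e, type) and issubclass(e, Exception)


def _missing(default, path):
    # Shared missing-file policy: raise a class with the path, re-raise an
    # instance, or fall through to returning the default as a plain value.
    if _is_error_class(default):
        raise default(path)
    elif isinstance(default, Exception):
        raise default
    return default


def load_json(path_parts, default=None):
    path = os.path.join(DATA_DIR, *path_parts)
    if not os.path.exists(path):
        return _missing(default, path)
    with io.open(path, mode='r', encoding='utf8') as file_:
        return json.load(file_)


@contextmanager
def open_(path_parts, mode='r', encoding='utf8', default=IOError):
    path = os.path.join(DATA_DIR, *path_parts)
    if not os.path.exists(path):
        yield _missing(default, path)  # raises unless default is a plain value
    else:
        file_ = io.open(path, mode=mode, encoding=encoding)
        yield file_
        file_.close()


# Mirroring the call sites in this commit (with /tmp/model absent):
rules = load_json(('vocab', 'lemma_rules.json'), default={})   # -> {}
with open_(('tokenizer', 'prefix.txt'), default=None) as file_:
    prefix = file_.read() if file_ is not None else None       # -> None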
@@ -92,10 +100,13 @@ def utf8open(loc, mode='r'):


 def read_lang_data(package):
-    tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
-    prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
-    suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
-    infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
+    tokenization = package.load_json(('tokenizer', 'specials.json'))
+    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
+        prefix = read_prefix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
+        suffix = read_suffix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
+        infix = read_infix(file_) if file_ is not None else None
     return tokenization, prefix, suffix, infix

@@ -50,13 +50,11 @@ cdef class Vocab:
     @classmethod
     def load(cls, pkg_or_str_or_file, get_lex_attr=None):
        package = Package.create_or_return(pkg_or_str_or_file)
-        with package.open(('vocab', 'tag_map.json'), default=None) as file_:
-            tag_map = json.load(file_) if file_ is not None else {}
+        tag_map = package.load_json(('vocab', 'tag_map.json'), default={})

         lemmatizer = Lemmatizer.load(package)

-        with package.open(('vocab', 'serializer.json'), default=None) as file_:
-            serializer_freqs = json.load(file_) if file_ is not None else {}
+        serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})

         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)