initial proposal for separate vector package

This commit is contained in:
Henning Peters 2016-03-04 11:09:06 +01:00
parent 7adbd7a785
commit 931c07a609
3 changed files with 14 additions and 8 deletions

View File

@ -2,6 +2,7 @@ from . import util
from .en import English from .en import English
def load(name, via=None): def load(name, via=None, vectors_name=None):
package = util.get_package_by_name(name, via=via) package = util.get_package_by_name(name, via=via)
return English(package=package) vectors_package = util.get_package_by_name(vectors_name, via=via)
return English(package=package, vectors_package=vectors_package)

View File

@ -153,7 +153,7 @@ class Language(object):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod @classmethod
def default_vocab(cls, package, get_lex_attr=None): def default_vocab(cls, package, get_lex_attr=None, vectors_package=None):
if get_lex_attr is None: if get_lex_attr is None:
if package.has_file('vocab', 'oov_prob'): if package.has_file('vocab', 'oov_prob'):
with package.open(('vocab', 'oov_prob')) as file_: with package.open(('vocab', 'oov_prob')) as file_:
@ -162,7 +162,8 @@ class Language(object):
else: else:
get_lex_attr = cls.default_lex_attrs() get_lex_attr = cls.default_lex_attrs()
if hasattr(package, 'dir_path'): if hasattr(package, 'dir_path'):
return Vocab.from_package(package, get_lex_attr=get_lex_attr) return Vocab.from_package(package, get_lex_attr=get_lex_attr,
vectors_package=vectors_package)
else: else:
return Vocab.load(package, get_lex_attr) return Vocab.load(package, get_lex_attr)
@ -198,7 +199,8 @@ class Language(object):
matcher=None, matcher=None,
serializer=None, serializer=None,
load_vectors=True, load_vectors=True,
package=None): package=None,
vectors_package=None):
""" """
a model can be specified: a model can be specified:
@ -228,7 +230,7 @@ class Language(object):
warn("load_vectors is deprecated", DeprecationWarning) warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True): if vocab in (None, True):
vocab = self.default_vocab(package) vocab = self.default_vocab(package, vectors_package=vectors_package)
self.vocab = vocab self.vocab = vocab
if tokenizer in (None, True): if tokenizer in (None, True):
tokenizer = Tokenizer.from_package(package, self.vocab) tokenizer = Tokenizer.from_package(package, self.vocab)

View File

@ -52,7 +52,7 @@ cdef class Vocab:
return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr) return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
@classmethod @classmethod
def from_package(cls, package, get_lex_attr=None): def from_package(cls, package, get_lex_attr=None, vectors_package=None):
tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.from_package(package) lemmatizer = Lemmatizer.from_package(package)
@ -66,7 +66,10 @@ cdef class Vocab:
self.strings.load(file_) self.strings.load(file_)
self.load_lexemes(package.file_path('vocab', 'lexemes.bin')) self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
if package.has_file('vocab', 'vec.bin'): if vectors_package and vectors_package.has_file('vocab', 'vec.bin'):
self.vectors_length = self.load_vectors_from_bin_loc(
vectors_package.file_path('vocab', 'vec.bin'))
elif package.has_file('vocab', 'vec.bin'):
self.vectors_length = self.load_vectors_from_bin_loc( self.vectors_length = self.load_vectors_from_bin_loc(
package.file_path('vocab', 'vec.bin')) package.file_path('vocab', 'vec.bin'))
return self return self