mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
initial proposal for separate vector package
This commit is contained in:
parent
7adbd7a785
commit
931c07a609
|
@ -2,6 +2,7 @@ from . import util
|
||||||
from .en import English
|
from .en import English
|
||||||
|
|
||||||
|
|
||||||
def load(name, via=None):
|
def load(name, via=None, vectors_name=None):
|
||||||
package = util.get_package_by_name(name, via=via)
|
package = util.get_package_by_name(name, via=via)
|
||||||
return English(package=package)
|
vectors_package = util.get_package_by_name(vectors_name, via=via)
|
||||||
|
return English(package=package, vectors_package=vectors_package)
|
||||||
|
|
|
@ -153,7 +153,7 @@ class Language(object):
|
||||||
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_vocab(cls, package, get_lex_attr=None):
|
def default_vocab(cls, package, get_lex_attr=None, vectors_package=None):
|
||||||
if get_lex_attr is None:
|
if get_lex_attr is None:
|
||||||
if package.has_file('vocab', 'oov_prob'):
|
if package.has_file('vocab', 'oov_prob'):
|
||||||
with package.open(('vocab', 'oov_prob')) as file_:
|
with package.open(('vocab', 'oov_prob')) as file_:
|
||||||
|
@ -162,7 +162,8 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
get_lex_attr = cls.default_lex_attrs()
|
get_lex_attr = cls.default_lex_attrs()
|
||||||
if hasattr(package, 'dir_path'):
|
if hasattr(package, 'dir_path'):
|
||||||
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
|
return Vocab.from_package(package, get_lex_attr=get_lex_attr,
|
||||||
|
vectors_package=vectors_package)
|
||||||
else:
|
else:
|
||||||
return Vocab.load(package, get_lex_attr)
|
return Vocab.load(package, get_lex_attr)
|
||||||
|
|
||||||
|
@ -198,7 +199,8 @@ class Language(object):
|
||||||
matcher=None,
|
matcher=None,
|
||||||
serializer=None,
|
serializer=None,
|
||||||
load_vectors=True,
|
load_vectors=True,
|
||||||
package=None):
|
package=None,
|
||||||
|
vectors_package=None):
|
||||||
"""
|
"""
|
||||||
a model can be specified:
|
a model can be specified:
|
||||||
|
|
||||||
|
@ -228,7 +230,7 @@ class Language(object):
|
||||||
warn("load_vectors is deprecated", DeprecationWarning)
|
warn("load_vectors is deprecated", DeprecationWarning)
|
||||||
|
|
||||||
if vocab in (None, True):
|
if vocab in (None, True):
|
||||||
vocab = self.default_vocab(package)
|
vocab = self.default_vocab(package, vectors_package=vectors_package)
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
if tokenizer in (None, True):
|
if tokenizer in (None, True):
|
||||||
tokenizer = Tokenizer.from_package(package, self.vocab)
|
tokenizer = Tokenizer.from_package(package, self.vocab)
|
||||||
|
|
|
@ -52,7 +52,7 @@ cdef class Vocab:
|
||||||
return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
|
return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_package(cls, package, get_lex_attr=None):
|
def from_package(cls, package, get_lex_attr=None, vectors_package=None):
|
||||||
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
||||||
|
|
||||||
lemmatizer = Lemmatizer.from_package(package)
|
lemmatizer = Lemmatizer.from_package(package)
|
||||||
|
@ -66,7 +66,10 @@ cdef class Vocab:
|
||||||
self.strings.load(file_)
|
self.strings.load(file_)
|
||||||
self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
|
self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
|
||||||
|
|
||||||
if package.has_file('vocab', 'vec.bin'):
|
if vectors_package and vectors_package.has_file('vocab', 'vec.bin'):
|
||||||
|
self.vectors_length = self.load_vectors_from_bin_loc(
|
||||||
|
vectors_package.file_path('vocab', 'vec.bin'))
|
||||||
|
elif package.has_file('vocab', 'vec.bin'):
|
||||||
self.vectors_length = self.load_vectors_from_bin_loc(
|
self.vectors_length = self.load_vectors_from_bin_loc(
|
||||||
package.file_path('vocab', 'vec.bin'))
|
package.file_path('vocab', 'vec.bin'))
|
||||||
return self
|
return self
|
||||||
|
|
Loading…
Reference in New Issue
Block a user