* Remove the vectors option to Vocab, preferring to either load vectors from disk, or set them on the Lexeme objects.

This commit is contained in:
Matthew Honnibal 2015-09-15 14:41:48 +10:00
parent 893542afae
commit 27f988b167
2 changed files with 6 additions and 14 deletions

View File

@@ -137,21 +137,14 @@ class Language(object):
return path.join(path.dirname(__file__), 'data') return path.join(path.dirname(__file__), 'data')
@classmethod @classmethod
def default_vectors(cls, data_dir): def default_vocab(cls, data_dir=None, get_lex_attr=None):
return None
@classmethod
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
if data_dir is None: if data_dir is None:
data_dir = cls.default_data_dir() data_dir = cls.default_data_dir()
if vectors is None:
vectors = cls.default_vectors(data_dir)
if get_lex_attr is None: if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs(data_dir) get_lex_attr = cls.default_lex_attrs(data_dir)
return Vocab.from_dir( return Vocab.from_dir(
path.join(data_dir, 'vocab'), path.join(data_dir, 'vocab'),
get_lex_attr=get_lex_attr, get_lex_attr=get_lex_attr)
vectors=vectors)
@classmethod @classmethod
def default_tokenizer(cls, vocab, data_dir): def default_tokenizer(cls, vocab, data_dir):
@@ -214,7 +207,7 @@ class Language(object):
self.entity = entity self.entity = entity
self.matcher = matcher self.matcher = matcher
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): def __call__(self, text, tag=True, parse=True, entity=True):
"""Apply the pipeline to some text. The text can span multiple sentences, """Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string and can contain arbtrary whitespace. Alignment into the original string
is preserved. is preserved.

View File

@@ -56,11 +56,10 @@ cdef class Vocab:
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) self.vectors_length = self.load_vectors(path.join(data_dir, 'vec.bin'))
return self return self
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None, lemmatizer=None, def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
serializer_freqs=None):
if tag_map is None: if tag_map is None:
tag_map = {} tag_map = {}
if lemmatizer is None: if lemmatizer is None:
@@ -262,7 +261,7 @@ cdef class Vocab:
i += 1 i += 1
fp.close() fp.close()
def load_rep_vectors(self, loc): def load_vectors(self, loc):
cdef CFile file_ = CFile(loc, b'rb') cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len cdef int32_t word_len
cdef int32_t vec_len cdef int32_t vec_len