Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-08-18 14:55:40 -05:00
commit 28162290b3
7 changed files with 255 additions and 77 deletions

requirements.txt

@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.8.1,<6.9.0
+thinc>=6.8.0,<6.9.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

setup.py

@@ -29,6 +29,7 @@ MOD_NAMES = [
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
     'spacy.tokenizer',
+    'spacy._cfile',
     'spacy.syntax.parser',
     'spacy.syntax.nn_parser',
     'spacy.syntax.beam_parser',
@@ -193,7 +194,7 @@ def setup_package():
         'murmurhash>=0.28,<0.29',
         'cymem>=1.30,<1.32',
         'preshed>=1.0.0,<2.0.0',
-        'thinc>=6.8.1,<6.9.0',
+        'thinc>=6.8.0,<6.9.0',
         'plac<1.0.0,>=0.9.6',
         'pip>=9.0.0,<10.0.0',
         'six',
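
Adding 'spacy._cfile' to MOD_NAMES is what gets the new Cython module compiled. As a rough illustration of the usual pattern (not spaCy's exact setup.py, which also sets include dirs and compiler flags), each entry maps to a C++ extension roughly like this:

    # Illustrative only: turning MOD_NAMES entries into cythonized extensions.
    from setuptools import Extension

    MOD_NAMES = ['spacy._cfile', 'spacy.tokenizer']

    extensions = [
        Extension(name, [name.replace('.', '/') + '.cpp'], language='c++')
        for name in MOD_NAMES
    ]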

spacy/_cfile.pxd (new file, 26 lines)

@@ -0,0 +1,26 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool


cdef class CFile:
    cdef FILE* fp
    cdef bint is_open
    cdef Pool mem
    cdef int size  # For compatibility with subclass
    cdef int _capacity  # For compatibility with subclass

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *


cdef class StringCFile(CFile):
    cdef unsigned char* data

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
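
The declarations above define a small fread/fwrite-style contract: read_into and write_from move `number` elements of `elem_size` bytes each, and alloc_read allocates from a Pool before reading into the new buffer. A rough pure-Python analogue of that contract (a sketch of the intended semantics, not the Cython code; PyCFile is a hypothetical name):

    # Pure-Python analogue of the CFile read/write contract declared above.
    class PyCFile:
        def __init__(self, loc, mode):
            self.fp = open(loc, mode)

        def read_into(self, number, elem_size):
            data = self.fp.read(number * elem_size)
            if len(data) != number * elem_size:
                raise IOError("short read")
            return data                      # the Cython version copies into a C buffer

        def write_from(self, data, number, elem_size):
            assert len(data) == number * elem_size
            self.fp.write(data)

        def close(self):
            self.fp.close()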

spacy/_cfile.pyx (new file, 88 lines)

@@ -0,0 +1,88 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy


cdef class CFile:
    def __init__(self, loc, mode, on_open_error=None):
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)

    def close(self):
        fclose(self.fp)
        self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)


cdef class StringCFile:
    def __init__(self, mode, bytes data=b'', on_open_error=None):
        self.mem = Pool()
        self.is_open = 'w' in mode
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]

    def close(self):
        self.is_open = False

    def string_data(self):
        return (self.data-self.size)[:self.size]

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        memcpy(dest, self.data, elem_size * number)
        self.data += elem_size * number

    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, elem_size * number)
        self.size += write_size

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)
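
StringCFile keeps the same interface but backs it with a growable in-memory buffer: write_from appends at self.size, doubling _capacity when needed, while read_into copies from the front and advances the data pointer. A minimal pure-Python sketch of that behaviour (bytearray in place of Pool-allocated memory; PyStringCFile is a hypothetical name):

    # Sketch of StringCFile's append-on-write / advance-on-read buffer behaviour.
    class PyStringCFile:
        def __init__(self, data=b''):
            self.buf = bytearray(data)   # bytearray grows itself; the Cython class
                                         # doubles _capacity and realloc()s instead
            self.read_pos = 0            # stands in for the advancing self.data pointer

        def write_from(self, src):
            self.buf.extend(src)

        def read_into(self, n_bytes):
            out = bytes(self.buf[self.read_pos:self.read_pos + n_bytes])
            self.read_pos += n_bytes
            return out

        def string_data(self):
            return bytes(self.buf)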

spacy/syntax/nn_parser.pyx

@@ -37,14 +37,11 @@ from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get

 from thinc.api import layerize, chain, noop, clone
-<<<<<<< HEAD
 from thinc.neural import Model, Affine, ELU, ReLu, Maxout
-=======
 from thinc.neural import Model, Affine, ReLu, Maxout
 from thinc.neural._classes.batchnorm import BatchNorm as BN
 from thinc.neural._classes.selu import SELU
 from thinc.neural._classes.layernorm import LayerNorm
->>>>>>> feature/nn-beam-parser

 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
@@ -54,6 +51,7 @@ from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch
 from ..compat import json_dumps

+from . import _beam_utils
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
@@ -68,10 +66,6 @@ from ..strings cimport StringStore
 from ..gold cimport GoldParse
 from ..attrs cimport TAG, DEP

-<<<<<<< HEAD
-=======
-USE_FINE_TUNE = True
->>>>>>> feature/nn-beam-parser

 def get_templates(*args, **kwargs):
     return []
@@ -259,7 +253,6 @@ cdef class Parser:
                               nI=token_vector_width)

         with Model.use_device('cpu'):
-<<<<<<< HEAD
             if depth == 0:
                 upper = chain()
                 upper.is_noop = True
@@ -269,12 +262,6 @@ cdef class Parser:
                     zero_init(Affine(nr_class, drop_factor=0.0))
                 )
                 upper.is_noop = False
-=======
-            upper = chain(
-                clone(Maxout(hidden_width), (depth-1)),
-                zero_init(Affine(nr_class, drop_factor=0.0))
-            )
->>>>>>> feature/nn-beam-parser
         # TODO: This is an unfortunate hack atm!
         # Used to set input dimensions in network.
         lower.begin_training(lower.ops.allocate((500, token_vector_width)))
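
The resolution keeps the HEAD side of this conflict: when depth is 0 the upper model is a pass-through (is_noop), otherwise it is a stack of Maxout layers feeding a zero-initialised Affine output. Reassembled from the hunks above (the else-branch header falls outside the hunk and is inferred, so treat this as a sketch rather than the exact source; the names come from the file's imports):

    # Resolved upper-model construction, reconstructed from the diff above.
    with Model.use_device('cpu'):
        if depth == 0:
            upper = chain()          # scores come straight from the lower model
            upper.is_noop = True
        else:
            upper = chain(
                clone(Maxout(hidden_width), (depth - 1)),
                zero_init(Affine(nr_class, drop_factor=0.0)))
            upper.is_noop = False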
@@ -422,7 +409,6 @@ cdef class Parser:
         c_is_valid = <int*>is_valid.data
         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
         while not next_step.empty():
-<<<<<<< HEAD
             if not has_hidden:
                 for i in cython.parallel.prange(
                         next_step.size(), num_threads=6, nogil=True):
@@ -442,21 +428,6 @@ cdef class Parser:
                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
                     action = self.moves.c[guess]
                     action.do(st, action.label)
-=======
-            for i in range(next_step.size()):
-                st = next_step[i]
-                st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
-                self.moves.set_valid(&c_is_valid[i*nr_class], st)
-            vectors = state2vec(token_ids[:next_step.size()])
-            scores = vec2scores(vectors)
-            c_scores = <float*>scores.data
-            for i in range(next_step.size()):
-                st = next_step[i]
-                guess = arg_max_if_valid(
-                    &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
-                action = self.moves.c[guess]
-                action.do(st, action.label)
->>>>>>> feature/nn-beam-parser
             this_step, next_step = next_step, this_step
             next_step.clear()
             for st in this_step:
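
Both sides of this conflict express the same greedy loop; the kept HEAD version adds a nogil prange fast path for the case where the upper layer is a no-op. Stripped of the Cython details, one step of the loop does roughly the following (plain Python, with hypothetical helper names standing in for set_context_tokens, set_valid, arg_max_if_valid and action.do):

    # Schematic of one greedy transition step over a batch of parser states.
    def greedy_step(states, state2vec, vec2scores, moves):
        token_ids = [moves.context_tokens(st) for st in states]   # feature extraction
        valid = [moves.valid_actions(st) for st in states]        # per-state action masks
        scores = vec2scores(state2vec(token_ids))                  # batched forward pass
        for st, row, mask in zip(states, scores, valid):
            guess = max((s, i) for i, s in enumerate(row) if mask[i])[1]
            moves.apply(st, guess)                                 # mutate the state in place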
@@ -526,17 +497,17 @@
             free(token_ids)

     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
-<<<<<<< HEAD
-=======
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
             return self.update_beam(docs_tokvecs, golds,
                     self.cfg['beam_width'], self.cfg['beam_density'],
                     drop=drop, sgd=sgd, losses=losses)
->>>>>>> feature/nn-beam-parser
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
         docs, tokvec_lists = docs_tokvecs
         tokvecs = self.model[0].ops.flatten(tokvec_lists)
+        my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+        tokvecs += self.model[0].ops.flatten(my_tokvecs)
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
@@ -589,12 +560,8 @@
                     break
         self._make_updates(d_tokvecs,
             backprops, sgd, cuda_stream)
-<<<<<<< HEAD
-        return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
-=======
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
-        if USE_FINE_TUNE:
-            bp_my_tokvecs(d_tokvecs, sgd=sgd)
+        bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs

     def update_beam(self, docs_tokvecs, golds, width=None, density=None,
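
With the USE_FINE_TUNE flag gone, update() (and update_beam() below) always fine-tunes the incoming token vectors: self.model[0] contributes its own vectors on the way in, and the gradient with respect to the token vectors is pushed back through that component at the end. Schematically (a hedged sketch of the pattern, with compute_gradient standing in for the transition-based update itself):

    # The fine-tuning pattern shared by update() and update_beam().
    def update_sketch(model0, docs_tokvecs, compute_gradient, drop=0.0, sgd=None):
        docs, tokvec_lists = docs_tokvecs
        tokvecs = model0.ops.flatten(tokvec_lists)
        # Forward: add the parser's own token-vector contribution.
        my_tokvecs, bp_my_tokvecs = model0.begin_update(docs_tokvecs, drop=drop)
        tokvecs += model0.ops.flatten(my_tokvecs)
        # ... transition-based update over `tokvecs`, yielding a flat gradient ...
        d_tokvecs = compute_gradient(tokvecs)
        # Backward: reshape per doc and backprop into the tok2vec component.
        d_tokvecs = model0.ops.unflatten(d_tokvecs, [len(d) for d in docs])
        bp_my_tokvecs(d_tokvecs, sgd=sgd)
        return d_tokvecs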
@@ -609,10 +576,9 @@
         lengths = [len(d) for d in docs]
         assert min(lengths) >= 1
         tokvecs = self.model[0].ops.flatten(tokvecs)
-        if USE_FINE_TUNE:
-            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
-            tokvecs += my_tokvecs
+        my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+        my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
+        tokvecs += my_tokvecs

         states = self.moves.init_batch(docs)
         for gold in golds:
@@ -643,10 +609,8 @@
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
-        if USE_FINE_TUNE:
-            bp_my_tokvecs(d_tokvecs, sgd=sgd)
+        bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs
->>>>>>> feature/nn-beam-parser

     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
@@ -691,21 +655,10 @@
         xp = get_array_module(d_tokvecs)
         for ids, d_vector, bp_vector in backprops:
             d_state_features = bp_vector(d_vector, sgd=sgd)
-<<<<<<< HEAD
-            active_feats = ids * (ids >= 0)
-            active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
-            if hasattr(xp, 'scatter_add'):
-                xp.scatter_add(d_tokvecs,
-                    ids, d_state_features * active_feats)
-            else:
-                xp.add.at(d_tokvecs,
-                    ids, d_state_features * active_feats)
-=======
             mask = ids >= 0
             d_state_features *= mask.reshape(ids.shape + (1,))
             self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
                 d_state_features)
->>>>>>> feature/nn-beam-parser

     @property
     def move_names(self):
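
The kept branch zeroes the gradients of padded feature slots (ids < 0) and scatter-adds each state's feature gradients into the rows of d_tokvecs they came from. In plain NumPy the same operation looks like this (numpy.add.at playing the role of ops.scatter_add; shapes chosen for illustration):

    # Masked scatter-add of per-feature gradients into the token-vector gradient.
    import numpy

    d_tokvecs = numpy.zeros((5, 4), dtype='f')                 # (n_tokens, width)
    ids = numpy.array([[0, 2, -1], [1, 1, 4]])                 # -1 marks an unused slot
    d_state_features = numpy.ones((2, 3, 4), dtype='f')        # (n_states, n_feats, width)

    mask = ids >= 0
    d_state_features *= mask.reshape(ids.shape + (1,))         # zero gradients for padding
    numpy.add.at(d_tokvecs, ids * mask, d_state_features)      # accumulate, repeats summed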

spacy/vectors.pyx

@@ -1,18 +1,26 @@
+from libc.stdint cimport int32_t, uint64_t
 import numpy
 from collections import OrderedDict
 import msgpack
 import msgpack_numpy
 msgpack_numpy.patch()
+from cymem.cymem cimport Pool
+cimport numpy as np
+from libcpp.vector cimport vector

+from .typedefs cimport attr_t
 from .strings cimport StringStore
 from . import util
+from ._cfile cimport CFile

+MAX_VEC_SIZE = 10000

 cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object key2i
+    cdef public object index

     def __init__(self, strings, data_or_width):
         self.strings = StringStore()
@@ -22,9 +30,9 @@ cdef class Vectors:
         else:
             data = data_or_width
         self.data = data
-        self.key2i = {}
+        self.index = {}
         for i, string in enumerate(strings):
-            self.key2i[self.strings.add(string)] = i
+            self.index[self.strings.add(string)] = i

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))
@@ -32,7 +40,7 @@ cdef class Vectors:
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.key2i[key]
+        i = self.index[key]
         if i is None:
             raise KeyError(key)
         else:
@@ -41,7 +49,7 @@ cdef class Vectors:
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.key2i[key]
+        i = self.index[key]
         self.data[i] = vector

     def __iter__(self):
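
The key2i to index rename doesn't change the lookup model: the table maps a hashed string key to a row of the data array, which is what __getitem__ and __setitem__ go through. A rough pure-Python sketch of that behaviour (illustrative class and method names, not the Cython implementation):

    # Sketch of the key -> row -> vector lookup that Vectors implements.
    import numpy

    class PyVectors:
        def __init__(self, width):
            self.data = numpy.zeros((0, width), dtype='f')
            self.index = {}                              # hashed key -> row number

        def add(self, key, vector):
            self.index[key] = self.data.shape[0]
            self.data = numpy.vstack([self.data, [vector]])

        def __getitem__(self, key):
            return self.data[self.index[key]]            # KeyError if the key is unknown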
@@ -61,34 +69,119 @@ cdef class Vectors:
     def most_similar(self, key):
         raise NotImplementedError

-    def to_disk(self, path):
-        raise NotImplementedError
-
-    def from_disk(self, path):
-        raise NotImplementedError
+    def to_disk(self, path, **exclude):
+        def serialize_vectors(p):
+            write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
+
+        serializers = OrderedDict((
+            ('vec.bin', serialize_vectors)
+        ))
+        return util.to_disk(serializers, exclude)
+
+    def from_disk(self, path, **exclude):
+        def deserialize_vectors(p):
+            self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
+
+        serializers = OrderedDict((
+            ('vec.bin', deserialize_vectors)
+        ))
+        return util.to_disk(serializers, exclude)

     def to_bytes(self, **exclude):
         def serialize_weights():
-            if hasattr(self.weights, 'to_bytes'):
-                return self.weights.to_bytes()
+            if hasattr(self.data, 'to_bytes'):
+                return self.data.to_bytes()
             else:
-                return msgpack.dumps(self.weights)
+                return msgpack.dumps(self.data)

         serializers = OrderedDict((
-            ('key2row', lambda: msgpack.dumps(self.key2i)),
             ('strings', lambda: self.strings.to_bytes()),
-            ('weights', serialize_weights)
+            ('vectors', serialize_weights)
         ))
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, data, **exclude):
         def deserialize_weights(b):
-            if hasattr(self.weights, 'from_bytes'):
-                self.weights.from_bytes()
+            if hasattr(self.data, 'from_bytes'):
+                self.data.from_bytes()
             else:
-                self.weights = msgpack.loads(b)
+                self.data = msgpack.loads(b)

         deserializers = OrderedDict((
-            ('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
-            ('weights', deserialize_weights)
+            ('vectors', deserialize_weights)
         ))
         return util.from_bytes(deserializers, exclude)
+
+
+def write_vectors_to_bin_loc(StringStore strings, dict key2i,
+                             np.ndarray vectors, out_loc):
+    cdef int32_t vec_len = vectors.shape[1]
+    cdef int32_t word_len
+    cdef bytes word_str
+    cdef char* chars
+    cdef uint64_t key
+    cdef int32_t i
+    cdef float* vec
+    cdef CFile out_file = CFile(out_loc, 'wb')
+    keys = [(i, key) for (key, i) in key2i.item()]
+    keys.sort()
+    for i, key in keys:
+        vec = <float*>vectors.data[i * vec_len]
+        word_str = strings[key].encode('utf8')
+        word_len = len(word_str)
+
+        out_file.write_from(&word_len, 1, sizeof(word_len))
+        out_file.write_from(&vec_len, 1, sizeof(vec_len))
+
+        chars = <char*>word_str
+        out_file.write_from(chars, word_len, sizeof(char))
+        out_file.write_from(vec, vec_len, sizeof(float))
+    out_file.close()
+
+
+def load_vectors_from_bin_loc(StringStore strings, loc):
+    """
+    Load vectors from the location of a binary file.
+
+    Arguments:
+        loc (unicode): The path of the binary file to load from.
+
+    Returns:
+        vec_len (int): The length of the vectors loaded.
+    """
+    cdef CFile file_ = CFile(loc, b'rb')
+    cdef int32_t word_len
+    cdef int32_t vec_len = 0
+    cdef int32_t prev_vec_len = 0
+    cdef float* vec
+    cdef attr_t string_id
+    cdef bytes py_word
+    cdef vector[float*] vectors
+    cdef int line_num = 0
+    cdef Pool mem = Pool()
+    cdef dict key2i = {}
+    while True:
+        try:
+            file_.read_into(&word_len, sizeof(word_len), 1)
+        except IOError:
+            break
+        file_.read_into(&vec_len, sizeof(vec_len), 1)
+        if prev_vec_len != 0 and vec_len != prev_vec_len:
+            raise Exception("Mismatched vector sizes")
+        if 0 >= vec_len >= MAX_VEC_SIZE:
+            raise Exception("Mismatched vector sizes")
+
+        chars = <char*>file_.alloc_read(mem, word_len, sizeof(char))
+        vec = <float*>file_.alloc_read(mem, vec_len, sizeof(float))
+
+        key = strings.add(chars[:word_len])
+        key2i[key] = vectors.size()
+        vectors.push_back(vec)
+    numpy_vectors = numpy.zeros((vectors.size(), vec_len), dtype='f')
+    for i in range(vectors.size()):
+        for j in range(vec_len):
+            numpy_vectors[i, j] = vectors[i][j]
+    return key2i, numpy_vectors
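
The two helpers define vec.bin as a sequence of records: an int32 word length, an int32 vector length, the UTF-8 word bytes, then vec_len float32 values, repeated until end of file. A small struct-based sketch of the same layout (pure Python, native byte order assumed, hypothetical function names):

    # Sketch of the vec.bin record layout:
    # <int32 word_len><int32 vec_len><word bytes><vec_len float32s>, repeated until EOF.
    import struct
    import numpy

    def write_record(fp, word, vec):
        word_bytes = word.encode('utf8')
        fp.write(struct.pack('ii', len(word_bytes), len(vec)))
        fp.write(word_bytes)
        fp.write(numpy.asarray(vec, dtype='f4').tobytes())

    def read_record(fp):
        header = fp.read(8)
        if len(header) < 8:
            return None                                   # EOF
        word_len, vec_len = struct.unpack('ii', header)
        word = fp.read(word_len).decode('utf8')
        vec = numpy.frombuffer(fp.read(vec_len * 4), dtype='f4')
        return word, vec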

spacy/vocab.pyx

@@ -280,7 +280,7 @@ cdef class Vocab:
         or int ID."""
         return False

-    def to_disk(self, path):
+    def to_disk(self, path, **exclude):
         """Save the current state to a directory.

         path (unicode or Path): A path to a directory, which will be created if
@@ -292,8 +292,10 @@ cdef class Vocab:
         self.strings.to_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('wb') as file_:
             file_.write(self.lexemes_to_bytes())
+        if self.vectors is not None:
+            self.vectors.to_disk(path, exclude='strings.json')

-    def from_disk(self, path):
+    def from_disk(self, path, **exclude):
         """Loads state from a directory. Modifies the object in place and
         returns it.
@@ -305,6 +307,8 @@ cdef class Vocab:
         self.strings.from_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('rb') as file_:
             self.lexemes_from_bytes(file_.read())
+        if self.vectors is not None:
+            self.vectors.from_disk(path, exclude='string.json')
         return self

     def to_bytes(self, **exclude):
@@ -313,9 +317,16 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `Vocab` object.
         """
+        def deserialize_vectors():
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.to_bytes(exclude='strings')
+
         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
             ('lexemes', lambda: self.lexemes_to_bytes()),
+            ('vectors', deserialize_vectors)
         ))
         return util.to_bytes(getters, exclude)
@@ -326,9 +337,15 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being loaded.
         RETURNS (Vocab): The `Vocab` object.
         """
+        def serialize_vectors(b):
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.from_bytes(b, exclude='strings')
+
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),
+            ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
         return self
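
These hooks follow spaCy's usual serialization pattern: an OrderedDict mapping names to zero-argument getters (to_bytes) or one-argument setters (from_bytes), handed to util.to_bytes / util.from_bytes, which skip anything named in exclude. A minimal sketch of how such a table is consumed (not spaCy's actual util code; msgpack framing is an assumption for the sketch):

    # Minimal sketch of the getters/setters dispatch behind to_bytes()/from_bytes().
    from collections import OrderedDict
    import msgpack

    def to_bytes_sketch(getters, exclude):
        msg = OrderedDict((name, getter()) for name, getter in getters.items()
                          if name not in exclude)
        return msgpack.dumps(msg)

    def from_bytes_sketch(bytes_data, setters, exclude):
        msg = msgpack.loads(bytes_data, raw=False)
        for name, setter in setters.items():
            if name not in exclude and name in msg:
                setter(msg[name])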