Strip serializer from code

Matthew Honnibal 2017-05-09 17:28:50 +02:00
parent 825c6403d8
commit 9e167b7bb6
4 changed files with 7 additions and 67 deletions

spacy/language.py

@@ -203,7 +203,6 @@ class Language(object):
                 parser=False,
                 entity=False,
                 matcher=False,
-                serializer=False,
                 vectors=False,
                 pipeline=False)
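
For callers that constructed a Language with every component disabled, the keyword simply disappears. A minimal sketch of such a call site after this change (whether Language accepts exactly these overrides depends on surrounding code the hunk does not show):

    from spacy.language import Language

    # Hypothetical call site: the remaining component flags from the hunk,
    # with serializer=False dropped.
    nlp = Language(parser=False,
                   entity=False,
                   matcher=False,
                   vectors=False,
                   pipeline=False)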

spacy/tokens/doc.pyx

@@ -22,7 +22,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
@@ -81,11 +80,6 @@ cdef class Doc:
         """
         Create a Doc object.
-
-        Aside: Implementation
-            This method of constructing a `Doc` object is usually only used
-            for deserialization. Standard usage is to construct the document via
-            a call to the language object.

         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
@@ -615,46 +609,13 @@ cdef class Doc:
         """
         Serialize, producing a byte string.
         """
-        byte_string = self.vocab.serializer.pack(self)
-        cdef uint32_t length = len(byte_string)
-        return struct.pack('I', length) + byte_string
+        raise NotImplementedError

     def from_bytes(self, data):
         """
         Deserialize, loading from bytes.
         """
-        self.vocab.serializer.unpack_into(data[4:], self)
-        return self
-
-    @staticmethod
-    def read_bytes(file_):
-        """
-        A static method, used to read serialized #[code Doc] objects from
-        a file. For example:
-
-        Example:
-            from spacy.tokens.doc import Doc
-            loc = 'test_serialize.bin'
-            with open(loc, 'wb') as file_:
-                file_.write(nlp(u'This is a document.').to_bytes())
-                file_.write(nlp(u'This is another.').to_bytes())
-            docs = []
-            with open(loc, 'rb') as file_:
-                for byte_string in Doc.read_bytes(file_):
-                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-            assert len(docs) == 2
-        """
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bytes_str = file_.read(4)
-                if len(n_bytes_str) < 4:
-                    break
-                n_bytes = struct.unpack('I', n_bytes_str)[0]
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            yield n_bytes_str + data
+        raise NotImplementedError

     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """

spacy/vocab.pxd

@@ -29,10 +29,8 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
     cdef readonly int length
-    cdef public object _serializer
     cdef public object data_dir
     cdef public object lex_attr_getters
-    cdef public object serializer_freqs

     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
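
In Cython, every attribute of a cdef class must be declared up front, here in the .pxd, so dropping the _serializer and serializer_freqs slots changes the extension type's layout rather than just deleting dead code. A toy illustration of the declaration kinds involved (illustrative only, not spaCy code):

    # toy.pyx
    cdef class Toy:
        cdef readonly int length     # readable from Python, not assignable
        cdef public object data_dir  # readable and assignable from Python
        cdef object _hidden          # invisible to Python code

        def __init__(self):
            self.length = 1
            self.data_dir = None
            self._hidden = None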

spacy/vocab.pyx

@@ -15,7 +15,6 @@ from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
 from .tokens.token cimport Token
-from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 from .compat import copy_reg, pickle
@@ -41,7 +40,7 @@ cdef class Vocab:
     """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
-             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+             tag_map=True, oov_prob=True, **deprecated_kwargs):
         """
         Load the vocabulary from a path.
@@ -80,22 +79,17 @@ cdef class Vocab:
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = ujson.load(file_)
-        else:
-            serializer_freqs = None
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
-                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
+                              lemmatizer=lemmatizer,
                               strings=strings_list)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self

     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), **deprecated_kwargs):
         """
         Create the vocabulary.
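
After this change a caller loads a vocabulary without mentioning serializer frequencies at all; the on-disk layout from the hunk (vocab/strings.json, vocab/lexemes.bin) is unchanged, with vocab/serializer.json simply no longer read. A hedged usage sketch (the model path is illustrative):

    from pathlib import Path
    from spacy.vocab import Vocab

    # serializer_freqs is no longer a recognised keyword; anything extra
    # now lands in **deprecated_kwargs.
    vocab = Vocab.load(Path('/path/to/model'),
                       lemmatizer=True, tag_map=True, oov_prob=True)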
@@ -119,7 +113,6 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer({}, {}, {})
-        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}

         self.mem = Pool()
         self._by_hash = PreshMap()
@@ -141,17 +134,8 @@ cdef class Vocab:
             _ = self.strings[name]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.serializer_freqs = serializer_freqs

         self.length = 1
-        self._serializer = None
-
-    property serializer:
-        # Having the serializer live here is super messy :(
-        def __get__(self):
-            if self._serializer is None:
-                self._serializer = Packer(self, self.serializer_freqs)
-            return self._serializer

     property lang:
         def __get__(self):
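
The property being deleted was a textbook lazily-initialized cache: build the Packer from serializer_freqs on first access, memoize it on _serializer, and hand back the cached object thereafter. The general pattern, in plain Python with stand-in names:

    class Serializer:
        # Stand-in for the removed Packer; assume it is costly to build.
        def __init__(self, freqs):
            self.freqs = freqs

    class Holder:
        def __init__(self, freqs):
            self._freqs = freqs
            self._serializer = None

        @property
        def serializer(self):
            # Construct on first access, reuse on every later one.
            if self._serializer is None:
                self._serializer = Serializer(self._freqs)
            return self._serializer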
@@ -630,7 +614,6 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     morph = vocab.morphology
     length = vocab.length
-    serializer = vocab._serializer
     data_dir = vocab.data_dir
     lex_attr_getters = vocab.lex_attr_getters
@@ -638,11 +621,11 @@ def pickle_vocab(vocab):
     vectors_length = vocab.vectors_length
     return (unpickle_vocab,
-        (sstore, morph, serializer, data_dir, lex_attr_getters,
+        (sstore, morph, data_dir, lex_attr_getters,
         lexemes_data, length, vectors_length))


-def unpickle_vocab(sstore, morphology, serializer, data_dir,
+def unpickle_vocab(sstore, morphology, data_dir,
         lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
     cdef Vocab vocab = Vocab()
     vocab.length = length
@@ -650,7 +633,6 @@ def unpickle_vocab(sstore, morphology, serializer, data_dir,
     vocab.strings = sstore
     cdef CFile fp = StringCFile('r', data=lexemes_data)
     vocab.morphology = morphology
-    vocab._serializer = serializer
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = lex_attr_getters
     vocab._deserialize_lexemes(fp)
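
pickle_vocab and unpickle_vocab follow the copyreg reduction protocol (copy_reg on Python 2, aliased through spacy.compat): the pickler stores a reconstructor plus an argument tuple, so dropping the serializer means removing one element from that tuple on both sides, in the same position. A minimal self-contained sketch of the protocol with toy names:

    import copyreg
    import pickle

    class Thing:
        def __init__(self, length=0):
            self.length = length

    def pickle_thing(thing):
        # Return (reconstructor, argument tuple); pickle stores both.
        return (unpickle_thing, (thing.length,))

    def unpickle_thing(length):
        return Thing(length)

    copyreg.pickle(Thing, pickle_thing)

    restored = pickle.loads(pickle.dumps(Thing(7)))
    assert restored.length == 7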