diff --git a/spacy/language.py b/spacy/language.py
index 51070b2f2..473381159 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -203,7 +203,6 @@ class Language(object):
             parser=False,
             entity=False,
             matcher=False,
-            serializer=False,
             vectors=False,
             pipeline=False)
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 66c09072c..8f91bd5c7 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -22,7 +22,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
@@ -81,11 +80,6 @@ cdef class Doc:
         """
         Create a Doc object.
 
-        Aside: Implementation
-            This method of constructing a `Doc` object is usually only used
-            for deserialization. Standard usage is to construct the document via
-            a call to the language object.
-
         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
@@ -615,46 +609,13 @@ cdef class Doc:
         """
         Serialize, producing a byte string.
         """
-        byte_string = self.vocab.serializer.pack(self)
-        cdef uint32_t length = len(byte_string)
-        return struct.pack('I', length) + byte_string
+        raise NotImplementedError
 
     def from_bytes(self, data):
         """
         Deserialize, loading from bytes.
         """
-        self.vocab.serializer.unpack_into(data[4:], self)
-        return self
-
-    @staticmethod
-    def read_bytes(file_):
-        """
-        A static method, used to read serialized #[code Doc] objects from
-        a file. For example:
-
-        Example:
-            from spacy.tokens.doc import Doc
-            loc = 'test_serialize.bin'
-            with open(loc, 'wb') as file_:
-                file_.write(nlp(u'This is a document.').to_bytes())
-                file_.write(nlp(u'This is another.').to_bytes())
-            docs = []
-            with open(loc, 'rb') as file_:
-                for byte_string in Doc.read_bytes(file_):
-                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-            assert len(docs) == 2
-        """
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bytes_str = file_.read(4)
-                if len(n_bytes_str) < 4:
-                    break
-                n_bytes = struct.unpack('I', n_bytes_str)[0]
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            yield n_bytes_str + data
+        raise NotImplementedError
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 8e0e363bc..3c31a8f8f 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -29,10 +29,8 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
     cdef readonly int length
-    cdef public object _serializer
     cdef public object data_dir
     cdef public object lex_attr_getters
-    cdef public object serializer_freqs
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 4df97ddf0..4255819aa 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -15,7 +15,6 @@ from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
 from .tokens.token cimport Token
-from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 from .compat import copy_reg, pickle
 
@@ -41,7 +40,7 @@ cdef class Vocab:
     """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
-             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+             tag_map=True, oov_prob=True, **deprecated_kwargs):
         """
         Load the vocabulary from a path.
 
@@ -80,22 +79,17 @@ cdef class Vocab:
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = ujson.load(file_)
-        else:
-            serializer_freqs = None
 
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
-                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
+                              lemmatizer=lemmatizer,
                               strings=strings_list)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
 
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), **deprecated_kwargs):
         """
         Create the vocabulary.
 
@@ -119,7 +113,6 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
            lemmatizer = Lemmatizer({}, {}, {})
-        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}
 
         self.mem = Pool()
         self._by_hash = PreshMap()
@@ -141,17 +134,8 @@ cdef class Vocab:
                 _ = self.strings[name]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.serializer_freqs = serializer_freqs
 
         self.length = 1
-        self._serializer = None
-
-    property serializer:
-        # Having the serializer live here is super messy :(
-        def __get__(self):
-            if self._serializer is None:
-                self._serializer = Packer(self, self.serializer_freqs)
-            return self._serializer
 
     property lang:
         def __get__(self):
@@ -630,7 +614,6 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     morph = vocab.morphology
     length = vocab.length
-    serializer = vocab._serializer
     data_dir = vocab.data_dir
     lex_attr_getters = vocab.lex_attr_getters
 
@@ -638,11 +621,11 @@ def pickle_vocab(vocab):
     vectors_length = vocab.vectors_length
 
     return (unpickle_vocab,
-        (sstore, morph, serializer, data_dir, lex_attr_getters,
+        (sstore, morph, data_dir, lex_attr_getters,
          lexemes_data, length, vectors_length))
 
 
-def unpickle_vocab(sstore, morphology, serializer, data_dir,
+def unpickle_vocab(sstore, morphology, data_dir,
                    lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
     cdef Vocab vocab = Vocab()
     vocab.length = length
@@ -650,7 +633,6 @@ def unpickle_vocab(sstore, morphology, serializer, data_dir,
     vocab.strings = sstore
     cdef CFile fp = StringCFile('r', data=lexemes_data)
     vocab.morphology = morphology
-    vocab._serializer = serializer
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = lex_attr_getters
     vocab._deserialize_lexemes(fp)
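Note: after this patch, Doc.to_bytes() and Doc.from_bytes() raise NotImplementedError, and the
Doc.read_bytes() helper is removed. For reference, here is a minimal sketch of the length-prefixed
framing the removed code used (a 4-byte struct 'I' length header followed by the packed payload).
The names write_doc_bytes/read_doc_bytes are hypothetical helpers for illustration, not part of
spaCy's API:

    import struct

    def write_doc_bytes(file_, payload):
        # Prefix the payload with its length, as the removed Doc.to_bytes() did
        # via struct.pack('I', length) + byte_string.
        file_.write(struct.pack('I', len(payload)) + payload)

    def read_doc_bytes(file_):
        # Mirror of the removed Doc.read_bytes(): read the 4-byte length header,
        # then yield the header plus that many payload bytes, until EOF.
        while True:
            n_bytes_str = file_.read(4)
            if len(n_bytes_str) < 4:
                break
            n_bytes = struct.unpack('I', n_bytes_str)[0]
            yield n_bytes_str + file_.read(n_bytes)

Code that previously looped over Doc.read_bytes(file_) would need an equivalent framing reader
like this until a replacement serialization API lands.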