Mirror of https://github.com/explosion/spaCy.git
Strip serializer from code

parent 825c6403d8
commit 9e167b7bb6
@@ -203,7 +203,6 @@ class Language(object):
             parser=False,
             entity=False,
             matcher=False,
-            serializer=False,
             vectors=False,
             pipeline=False)
@@ -22,7 +22,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
@@ -81,11 +80,6 @@ cdef class Doc:
         """
         Create a Doc object.
-
-        Aside: Implementation
-            This method of constructing a `Doc` object is usually only used
-            for deserialization. Standard usage is to construct the document via
-            a call to the language object.
 
         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
@@ -615,46 +609,13 @@ cdef class Doc:
         """
         Serialize, producing a byte string.
         """
-        byte_string = self.vocab.serializer.pack(self)
-        cdef uint32_t length = len(byte_string)
-        return struct.pack('I', length) + byte_string
+        raise NotImplementedError
 
     def from_bytes(self, data):
         """
         Deserialize, loading from bytes.
         """
-        self.vocab.serializer.unpack_into(data[4:], self)
-        return self
 
     @staticmethod
     def read_bytes(file_):
-        """
-        A static method, used to read serialized #[code Doc] objects from
-        a file. For example:
-
-        Example:
-            from spacy.tokens.doc import Doc
-            loc = 'test_serialize.bin'
-            with open(loc, 'wb') as file_:
-                file_.write(nlp(u'This is a document.').to_bytes())
-                file_.write(nlp(u'This is another.').to_bytes())
-            docs = []
-            with open(loc, 'rb') as file_:
-                for byte_string in Doc.read_bytes(file_):
-                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-            assert len(docs) == 2
-        """
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bytes_str = file_.read(4)
-                if len(n_bytes_str) < 4:
-                    break
-                n_bytes = struct.unpack('I', n_bytes_str)[0]
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            yield n_bytes_str + data
+        raise NotImplementedError
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """
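
Note: the removed to_bytes()/read_bytes() pair above relied on a plain
length-prefixed framing: each serialized document was written as a 4-byte
struct-packed unsigned length followed by the payload, and read back by
repeatedly consuming a 4-byte header. Below is a minimal standalone sketch of
that framing, with hypothetical helper names and arbitrary byte strings
standing in for packed documents:

    import struct

    def write_record(file_, payload):
        # Prefix the payload with its length as a 4-byte unsigned int.
        file_.write(struct.pack('I', len(payload)) + payload)

    def read_records(file_):
        # Yield one payload per length-prefixed record until the file ends.
        while True:
            header = file_.read(4)
            if len(header) < 4:
                break
            n_bytes = struct.unpack('I', header)[0]
            yield file_.read(n_bytes)
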
@@ -29,10 +29,8 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
     cdef readonly int length
-    cdef public object _serializer
     cdef public object data_dir
     cdef public object lex_attr_getters
-    cdef public object serializer_freqs
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
@@ -15,7 +15,6 @@ from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
 from .tokens.token cimport Token
-from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 
 from .compat import copy_reg, pickle
@@ -41,7 +40,7 @@ cdef class Vocab:
     """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
-             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+             tag_map=True, oov_prob=True, **deprecated_kwargs):
         """
         Load the vocabulary from a path.
 
@@ -80,22 +79,17 @@ cdef class Vocab:
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = ujson.load(file_)
-        else:
-            serializer_freqs = None
 
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
-                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
+                              lemmatizer=lemmatizer,
                               strings=strings_list)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
 
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), **deprecated_kwargs):
         """
         Create the vocabulary.
 
@@ -119,7 +113,6 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer({}, {}, {})
-        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}
 
         self.mem = Pool()
         self._by_hash = PreshMap()
@@ -141,17 +134,8 @@ cdef class Vocab:
             _ = self.strings[name]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.serializer_freqs = serializer_freqs
 
         self.length = 1
-        self._serializer = None
-
-    property serializer:
-        # Having the serializer live here is super messy :(
-        def __get__(self):
-            if self._serializer is None:
-                self._serializer = Packer(self, self.serializer_freqs)
-            return self._serializer
 
     property lang:
         def __get__(self):
@@ -630,7 +614,6 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     morph = vocab.morphology
     length = vocab.length
-    serializer = vocab._serializer
    data_dir = vocab.data_dir
     lex_attr_getters = vocab.lex_attr_getters
 
@@ -638,11 +621,11 @@ def pickle_vocab(vocab):
     vectors_length = vocab.vectors_length
 
     return (unpickle_vocab,
-        (sstore, morph, serializer, data_dir, lex_attr_getters,
+        (sstore, morph, data_dir, lex_attr_getters,
         lexemes_data, length, vectors_length))
 
 
-def unpickle_vocab(sstore, morphology, serializer, data_dir,
+def unpickle_vocab(sstore, morphology, data_dir,
                    lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
     cdef Vocab vocab = Vocab()
     vocab.length = length
@@ -650,7 +633,6 @@ def unpickle_vocab(sstore, morphology, serializer, data_dir,
     vocab.strings = sstore
     cdef CFile fp = StringCFile('r', data=lexemes_data)
     vocab.morphology = morphology
-    vocab._serializer = serializer
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = lex_attr_getters
     vocab._deserialize_lexemes(fp)
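
The pickle_vocab()/unpickle_vocab() pair edited above follows the standard
copyreg reducer pattern: pickling an object produces a (reconstructor, args)
tuple, and unpickling calls the reconstructor with those args. A minimal sketch
of the same pattern on a toy class (the class and helper names here are
illustrative, not part of spaCy):

    import copyreg
    import pickle

    class Thing(object):
        def __init__(self, name=None):
            self.name = name

    def pickle_thing(thing):
        # Tell pickle how to rebuild a Thing: call unpickle_thing(name).
        return (unpickle_thing, (thing.name,))

    def unpickle_thing(name):
        thing = Thing()
        thing.name = name
        return thing

    # Register the reducer so pickle.dumps() routes Thing through pickle_thing().
    copyreg.pickle(Thing, pickle_thing)

    restored = pickle.loads(pickle.dumps(Thing('example')))
    assert restored.name == 'example'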