Strip serializer from code

Matthew Honnibal 2017-05-09 17:28:50 +02:00
parent 825c6403d8
commit 9e167b7bb6
4 changed files with 7 additions and 67 deletions

spacy/language.py

@@ -203,7 +203,6 @@ class Language(object):
                 parser=False,
                 entity=False,
                 matcher=False,
-                serializer=False,
                 vectors=False,
                 pipeline=False)
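
For callers that constructed a Language with every component disabled, the keyword simply disappears. A minimal sketch of such a call site after this change (whether Language accepts exactly these overrides depends on surrounding code the hunk does not show):

    from spacy.language import Language

    # Hypothetical call site: the remaining component flags from the hunk,
    # with serializer=False dropped.
    nlp = Language(parser=False,
                   entity=False,
                   matcher=False,
                   vectors=False,
                   pipeline=False)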

spacy/tokens/doc.pyx

@@ -22,7 +22,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
@@ -81,11 +80,6 @@ cdef class Doc:
         """
         Create a Doc object.
-
-        Aside: Implementation
-            This method of constructing a `Doc` object is usually only used
-            for deserialization. Standard usage is to construct the document via
-            a call to the language object.

         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
@@ -615,46 +609,13 @@ cdef class Doc:
         """
         Serialize, producing a byte string.
         """
-        byte_string = self.vocab.serializer.pack(self)
-        cdef uint32_t length = len(byte_string)
-        return struct.pack('I', length) + byte_string
+        raise NotImplementedError

     def from_bytes(self, data):
         """
         Deserialize, loading from bytes.
         """
-        self.vocab.serializer.unpack_into(data[4:], self)
-        return self
-
-    @staticmethod
-    def read_bytes(file_):
-        """
-        A static method, used to read serialized #[code Doc] objects from
-        a file. For example:
-
-        Example:
-            from spacy.tokens.doc import Doc
-            loc = 'test_serialize.bin'
-            with open(loc, 'wb') as file_:
-                file_.write(nlp(u'This is a document.').to_bytes())
-                file_.write(nlp(u'This is another.').to_bytes())
-            docs = []
-            with open(loc, 'rb') as file_:
-                for byte_string in Doc.read_bytes(file_):
-                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-            assert len(docs) == 2
-        """
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bytes_str = file_.read(4)
-                if len(n_bytes_str) < 4:
-                    break
-                n_bytes = struct.unpack('I', n_bytes_str)[0]
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            yield n_bytes_str + data
+        raise NotImplementedError

     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """

spacy/vocab.pxd

@@ -29,10 +29,8 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
     cdef readonly int length
-    cdef public object _serializer
     cdef public object data_dir
     cdef public object lex_attr_getters
-    cdef public object serializer_freqs

     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
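
In Cython, every attribute of a cdef class must be declared up front, here in the .pxd, so dropping the _serializer and serializer_freqs slots changes the extension type's layout rather than just deleting dead code. A toy illustration of the declaration kinds involved (illustrative only, not spaCy code):

    # toy.pyx
    cdef class Toy:
        cdef readonly int length     # readable from Python, not assignable
        cdef public object data_dir  # readable and assignable from Python
        cdef object _hidden          # invisible to Python code

        def __init__(self):
            self.length = 1
            self.data_dir = None
            self._hidden = None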

spacy/vocab.pyx

@@ -15,7 +15,6 @@ from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
 from .tokens.token cimport Token
-from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 from .compat import copy_reg, pickle
@@ -41,7 +40,7 @@ cdef class Vocab:
     """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
-             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+             tag_map=True, oov_prob=True, **deprecated_kwargs):
         """
         Load the vocabulary from a path.
@@ -80,22 +79,17 @@ cdef class Vocab:
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = ujson.load(file_)
-        else:
-            serializer_freqs = None
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
-                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
+                              lemmatizer=lemmatizer,
                               strings=strings_list)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self

     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), **deprecated_kwargs):
         """
         Create the vocabulary.
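
After this change a caller loads a vocabulary without mentioning serializer frequencies at all; the on-disk layout from the hunk (vocab/strings.json, vocab/lexemes.bin) is unchanged, with vocab/serializer.json simply no longer read. A hedged usage sketch (the model path is illustrative):

    from pathlib import Path
    from spacy.vocab import Vocab

    # serializer_freqs is no longer a recognised keyword; anything extra
    # now lands in **deprecated_kwargs.
    vocab = Vocab.load(Path('/path/to/model'),
                       lemmatizer=True, tag_map=True, oov_prob=True)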
@@ -119,7 +113,6 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer({}, {}, {})
-        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}

         self.mem = Pool()
         self._by_hash = PreshMap()
@@ -141,17 +134,8 @@ cdef class Vocab:
             _ = self.strings[name]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.serializer_freqs = serializer_freqs

         self.length = 1
-        self._serializer = None
-
-    property serializer:
-        # Having the serializer live here is super messy :(
-        def __get__(self):
-            if self._serializer is None:
-                self._serializer = Packer(self, self.serializer_freqs)
-            return self._serializer

     property lang:
         def __get__(self):
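
The property being deleted was a textbook lazily-initialized cache: build the Packer from serializer_freqs on first access, memoize it on _serializer, and hand back the cached object thereafter. The general pattern, in plain Python with stand-in names:

    class Serializer:
        # Stand-in for the removed Packer; assume it is costly to build.
        def __init__(self, freqs):
            self.freqs = freqs

    class Holder:
        def __init__(self, freqs):
            self._freqs = freqs
            self._serializer = None

        @property
        def serializer(self):
            # Construct on first access, reuse on every later one.
            if self._serializer is None:
                self._serializer = Serializer(self._freqs)
            return self._serializer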
@@ -630,7 +614,6 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     morph = vocab.morphology
     length = vocab.length
-    serializer = vocab._serializer
     data_dir = vocab.data_dir
     lex_attr_getters = vocab.lex_attr_getters
@@ -638,11 +621,11 @@ def pickle_vocab(vocab):
     vectors_length = vocab.vectors_length
     return (unpickle_vocab,
-        (sstore, morph, serializer, data_dir, lex_attr_getters,
+        (sstore, morph, data_dir, lex_attr_getters,
         lexemes_data, length, vectors_length))


-def unpickle_vocab(sstore, morphology, serializer, data_dir,
+def unpickle_vocab(sstore, morphology, data_dir,
         lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
     cdef Vocab vocab = Vocab()
     vocab.length = length
@@ -650,7 +633,6 @@ def unpickle_vocab(sstore, morphology, serializer, data_dir,
     vocab.strings = sstore
     cdef CFile fp = StringCFile('r', data=lexemes_data)
     vocab.morphology = morphology
-    vocab._serializer = serializer
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = lex_attr_getters
     vocab._deserialize_lexemes(fp)
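
pickle_vocab and unpickle_vocab follow the copyreg reduction protocol (copy_reg on Python 2, aliased through spacy.compat): the pickler stores a reconstructor plus an argument tuple, so dropping the serializer means removing one element from that tuple on both sides, in the same position. A minimal self-contained sketch of the protocol with toy names:

    import copyreg
    import pickle

    class Thing:
        def __init__(self, length=0):
            self.length = length

    def pickle_thing(thing):
        # Return (reconstructor, argument tuple); pickle stores both.
        return (unpickle_thing, (thing.length,))

    def unpickle_thing(length):
        return Thing(length)

    copyreg.pickle(Thing, pickle_thing)

    restored = pickle.loads(pickle.dumps(Thing(7)))
    assert restored.length == 7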