Mirror of https://github.com/explosion/spaCy.git
Strip serializer from code

parent 825c6403d8
commit 9e167b7bb6
@@ -203,7 +203,6 @@ class Language(object):
             parser=False,
             entity=False,
             matcher=False,
-            serializer=False,
             vectors=False,
             pipeline=False)
@@ -22,7 +22,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
@@ -81,11 +80,6 @@ cdef class Doc:
         """
         Create a Doc object.
-
-        Aside: Implementation
-            This method of constructing a `Doc` object is usually only used
-            for deserialization. Standard usage is to construct the document via
-            a call to the language object.
 
         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
@@ -615,46 +609,13 @@ cdef class Doc:
         """
         Serialize, producing a byte string.
         """
-        byte_string = self.vocab.serializer.pack(self)
-        cdef uint32_t length = len(byte_string)
-        return struct.pack('I', length) + byte_string
+        raise NotImplementedError
 
     def from_bytes(self, data):
         """
         Deserialize, loading from bytes.
         """
-        self.vocab.serializer.unpack_into(data[4:], self)
-        return self
 
     @staticmethod
     def read_bytes(file_):
-        """
-        A static method, used to read serialized #[code Doc] objects from
-        a file. For example:
-
-        Example:
-            from spacy.tokens.doc import Doc
-            loc = 'test_serialize.bin'
-            with open(loc, 'wb') as file_:
-                file_.write(nlp(u'This is a document.').to_bytes())
-                file_.write(nlp(u'This is another.').to_bytes())
-            docs = []
-            with open(loc, 'rb') as file_:
-                for byte_string in Doc.read_bytes(file_):
-                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-            assert len(docs) == 2
-        """
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bytes_str = file_.read(4)
-                if len(n_bytes_str) < 4:
-                    break
-                n_bytes = struct.unpack('I', n_bytes_str)[0]
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            yield n_bytes_str + data
+        raise NotImplementedError
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """
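
Note: the removed to_bytes()/read_bytes() pair above relied on a plain
length-prefixed framing: each serialized document was written as a 4-byte
struct-packed unsigned length followed by the payload, and read back by
repeatedly consuming a 4-byte header. Below is a minimal standalone sketch of
that framing, with hypothetical helper names and arbitrary byte strings
standing in for packed documents:

    import struct

    def write_record(file_, payload):
        # Prefix the payload with its length as a 4-byte unsigned int.
        file_.write(struct.pack('I', len(payload)) + payload)

    def read_records(file_):
        # Yield one payload per length-prefixed record until the file ends.
        while True:
            header = file_.read(4)
            if len(header) < 4:
                break
            n_bytes = struct.unpack('I', header)[0]
            yield file_.read(n_bytes)
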
@@ -29,10 +29,8 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
     cdef readonly int length
-    cdef public object _serializer
     cdef public object data_dir
     cdef public object lex_attr_getters
-    cdef public object serializer_freqs
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
@@ -15,7 +15,6 @@ from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
 from .tokens.token cimport Token
-from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 
 from .compat import copy_reg, pickle
@@ -41,7 +40,7 @@ cdef class Vocab:
     """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
-             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+             tag_map=True, oov_prob=True, **deprecated_kwargs):
         """
         Load the vocabulary from a path.
 
@@ -80,22 +79,17 @@ cdef class Vocab:
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = ujson.load(file_)
-        else:
-            serializer_freqs = None
 
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
-                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
+                              lemmatizer=lemmatizer,
                               strings=strings_list)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
 
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), **deprecated_kwargs):
         """
         Create the vocabulary.
 
@@ -119,7 +113,6 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer({}, {}, {})
-        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}
 
         self.mem = Pool()
         self._by_hash = PreshMap()
@@ -141,17 +134,8 @@ cdef class Vocab:
             _ = self.strings[name]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.serializer_freqs = serializer_freqs
 
         self.length = 1
-        self._serializer = None
-
-    property serializer:
-        # Having the serializer live here is super messy :(
-        def __get__(self):
-            if self._serializer is None:
-                self._serializer = Packer(self, self.serializer_freqs)
-            return self._serializer
 
     property lang:
         def __get__(self):
@@ -630,7 +614,6 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     morph = vocab.morphology
     length = vocab.length
-    serializer = vocab._serializer
    data_dir = vocab.data_dir
     lex_attr_getters = vocab.lex_attr_getters
 
@@ -638,11 +621,11 @@ def pickle_vocab(vocab):
     vectors_length = vocab.vectors_length
 
     return (unpickle_vocab,
-        (sstore, morph, serializer, data_dir, lex_attr_getters,
+        (sstore, morph, data_dir, lex_attr_getters,
         lexemes_data, length, vectors_length))
 
 
-def unpickle_vocab(sstore, morphology, serializer, data_dir,
+def unpickle_vocab(sstore, morphology, data_dir,
                    lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
     cdef Vocab vocab = Vocab()
     vocab.length = length
@@ -650,7 +633,6 @@ def unpickle_vocab(sstore, morphology, serializer, data_dir,
     vocab.strings = sstore
     cdef CFile fp = StringCFile('r', data=lexemes_data)
     vocab.morphology = morphology
-    vocab._serializer = serializer
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = lex_attr_getters
     vocab._deserialize_lexemes(fp)
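
The pickle_vocab()/unpickle_vocab() pair edited above follows the standard
copyreg reducer pattern: pickling an object produces a (reconstructor, args)
tuple, and unpickling calls the reconstructor with those args. A minimal sketch
of the same pattern on a toy class (the class and helper names here are
illustrative, not part of spaCy):

    import copyreg
    import pickle

    class Thing(object):
        def __init__(self, name=None):
            self.name = name

    def pickle_thing(thing):
        # Tell pickle how to rebuild a Thing: call unpickle_thing(name).
        return (unpickle_thing, (thing.name,))

    def unpickle_thing(name):
        thing = Thing()
        thing.name = name
        return thing

    # Register the reducer so pickle.dumps() routes Thing through pickle_thing().
    copyreg.pickle(Thing, pickle_thing)

    restored = pickle.loads(pickle.dumps(Thing('example')))
    assert restored.name == 'example'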