Strip serializer from code

parent 825c6403d8 · commit 9e167b7bb6

This commit strips the old Packer-based binary serializer out of the code:
Doc.to_bytes() and Doc.from_bytes() are stubbed to raise NotImplementedError,
Doc.read_bytes() is deleted outright, and Vocab loses its _serializer slot
along with the serializer_freqs plumbing in load(), __init__() and the
pickle helpers.
spacy/language.py

@@ -203,7 +203,6 @@ class Language(object):
                 parser=False,
                 entity=False,
                 matcher=False,
-                serializer=False,
                 vectors=False,
                 pipeline=False)
 
spacy/tokens/doc.pyx

@@ -22,7 +22,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
@@ -81,11 +80,6 @@ cdef class Doc:
         """
         Create a Doc object.
-
-        Aside: Implementation
-            This method of constructing a `Doc` object is usually only used
-            for deserialization. Standard usage is to construct the document via
-            a call to the language object.
 
         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
@@ -615,46 +609,13 @@ cdef class Doc:
         """
         Serialize, producing a byte string.
         """
-        byte_string = self.vocab.serializer.pack(self)
-        cdef uint32_t length = len(byte_string)
-        return struct.pack('I', length) + byte_string
+        raise NotImplementedError
 
     def from_bytes(self, data):
         """
         Deserialize, loading from bytes.
         """
-        self.vocab.serializer.unpack_into(data[4:], self)
-        return self
-
-    @staticmethod
-    def read_bytes(file_):
-        """
-        A static method, used to read serialized #[code Doc] objects from
-        a file. For example:
-
-        Example:
-            from spacy.tokens.doc import Doc
-            loc = 'test_serialize.bin'
-            with open(loc, 'wb') as file_:
-                file_.write(nlp(u'This is a document.').to_bytes())
-                file_.write(nlp(u'This is another.').to_bytes())
-            docs = []
-            with open(loc, 'rb') as file_:
-                for byte_string in Doc.read_bytes(file_):
-                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-            assert len(docs) == 2
-        """
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bytes_str = file_.read(4)
-                if len(n_bytes_str) < 4:
-                    break
-                n_bytes = struct.unpack('I', n_bytes_str)[0]
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            yield n_bytes_str + data
+        raise NotImplementedError
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """
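The methods stubbed out above implemented a simple length-prefixed framing:
to_bytes() prepended a 4-byte unsigned-int header (struct.pack('I', length))
to the packed payload, and read_bytes() re-read that header to split a stream
back into per-Doc byte strings, yielding header + payload together because
from_bytes() skipped the header with data[4:]. A minimal self-contained
sketch of the same framing, independent of spaCy (write_frame and read_frames
are hypothetical names; unlike the removed read_bytes, this version yields
the bare payload):

    import io
    import struct

    def write_frame(file_, payload):
        # 4-byte unsigned-int length header, then the payload itself,
        # mirroring struct.pack('I', length) + byte_string above.
        file_.write(struct.pack('I', len(payload)) + payload)

    def read_frames(file_):
        # Inverse: read the header, then exactly that many payload bytes,
        # until the stream is exhausted.
        while True:
            header = file_.read(4)
            if len(header) < 4:   # EOF (or a truncated trailing header)
                break
            n_bytes = struct.unpack('I', header)[0]
            yield file_.read(n_bytes)

    buf = io.BytesIO()
    write_frame(buf, b'This is a document.')
    write_frame(buf, b'This is another.')
    buf.seek(0)
    assert list(read_frames(buf)) == [b'This is a document.', b'This is another.']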
spacy/vocab.pxd

@@ -29,10 +29,8 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
     cdef readonly int length
-    cdef public object _serializer
     cdef public object data_dir
     cdef public object lex_attr_getters
-    cdef public object serializer_freqs
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
spacy/vocab.pyx

@@ -15,7 +15,6 @@ from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
 from .tokens.token cimport Token
-from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 
 from .compat import copy_reg, pickle
@@ -41,7 +40,7 @@ cdef class Vocab:
     """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
-             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+             tag_map=True, oov_prob=True, **deprecated_kwargs):
         """
         Load the vocabulary from a path.
 
@@ -80,22 +79,17 @@ cdef class Vocab:
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = ujson.load(file_)
-        else:
-            serializer_freqs = None
 
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
-                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
+                              lemmatizer=lemmatizer,
                               strings=strings_list)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
 
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), **deprecated_kwargs):
         """
         Create the vocabulary.
 
@@ -119,7 +113,6 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer({}, {}, {})
-        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}
 
         self.mem = Pool()
         self._by_hash = PreshMap()
@@ -141,17 +134,8 @@ cdef class Vocab:
             _ = self.strings[name]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.serializer_freqs = serializer_freqs
 
         self.length = 1
-        self._serializer = None
-
-    property serializer:
-        # Having the serializer live here is super messy :(
-        def __get__(self):
-            if self._serializer is None:
-                self._serializer = Packer(self, self.serializer_freqs)
-            return self._serializer
 
     property lang:
         def __get__(self):
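The property deleted above was a lazy cache: the Packer was only constructed
on first access and memoized in _serializer (the inline comment concedes the
placement was messy). A plain-Python sketch of that lazy-attribute pattern,
with a hypothetical make_packer() standing in for Packer(self, self.serializer_freqs):

    def make_packer():
        # Hypothetical stand-in for the real Packer construction.
        return object()

    class Holder(object):
        def __init__(self):
            self._serializer = None   # nothing built until first access

        @property
        def serializer(self):
            # Build once on demand, then keep returning the cached instance.
            if self._serializer is None:
                self._serializer = make_packer()
            return self._serializer

    h = Holder()
    assert h.serializer is h.serializer   # same cached object both times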
@@ -630,7 +614,6 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     morph = vocab.morphology
     length = vocab.length
-    serializer = vocab._serializer
     data_dir = vocab.data_dir
     lex_attr_getters = vocab.lex_attr_getters
 
@@ -638,11 +621,11 @@ def pickle_vocab(vocab):
     vectors_length = vocab.vectors_length
 
     return (unpickle_vocab,
-        (sstore, morph, serializer, data_dir, lex_attr_getters,
+        (sstore, morph, data_dir, lex_attr_getters,
         lexemes_data, length, vectors_length))
 
 
-def unpickle_vocab(sstore, morphology, serializer, data_dir,
+def unpickle_vocab(sstore, morphology, data_dir,
         lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
     cdef Vocab vocab = Vocab()
     vocab.length = length
@@ -650,7 +633,6 @@ def unpickle_vocab(sstore, morphology, serializer, data_dir,
     vocab.strings = sstore
     cdef CFile fp = StringCFile('r', data=lexemes_data)
     vocab.morphology = morphology
-    vocab._serializer = serializer
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = lex_attr_getters
     vocab._deserialize_lexemes(fp)
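pickle_vocab() and unpickle_vocab() follow the copyreg reduce protocol
(vocab.pyx imports copy_reg and pickle from .compat): the reducer returns a
(reconstructor, args) tuple, and pickle calls the reconstructor with exactly
those args at load time, so dropping the serializer slot just shortens the
tuple on both ends. A minimal sketch of the protocol with a toy class
(Thing, pickle_thing and unpickle_thing are hypothetical):

    import copyreg   # 'copy_reg' on Python 2, which spacy.compat papers over
    import pickle

    class Thing(object):
        def __init__(self, length=0):
            self.length = length

    def pickle_thing(thing):
        # Reducer: hand pickle the reconstructor and its argument tuple.
        return (unpickle_thing, (thing.length,))

    def unpickle_thing(length):
        # Reconstructor: rebuild the object from the pickled state.
        return Thing(length)

    copyreg.pickle(Thing, pickle_thing)

    restored = pickle.loads(pickle.dumps(Thing(42)))
    assert restored.length == 42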