mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* Tests passing on round-trip pack/unpack on basic example
This commit is contained in:
parent
44f39a876f
commit
cf0c788892
|
@ -19,3 +19,5 @@ cdef class BitArray:
|
||||||
cdef uint32_t i
|
cdef uint32_t i
|
||||||
|
|
||||||
cdef int extend(self, uint64_t code, char n_bits) except -1
|
cdef int extend(self, uint64_t code, char n_bits) except -1
|
||||||
|
|
||||||
|
cdef uint32_t read32(self) except 0
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
|
from libc.string cimport memcpy
|
||||||
|
|
||||||
# Note that we're setting the most significant bits here first, when in practice
|
# Note that we're setting the most significant bits here first, when in practice
|
||||||
# we're actually wanting the last bit to be most significant (for Huffman coding,
|
# we're actually wanting the last bit to be most significant (for Huffman coding,
|
||||||
|
@ -20,19 +20,62 @@ cdef class BitArray:
|
||||||
self.bit_of_byte = 0
|
self.bit_of_byte = 0
|
||||||
self.i = 0
|
self.i = 0
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return 8 * len(self.data) + self.bit_of_byte
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
cdef uchar byte, i
|
||||||
|
cdef uchar one = 1
|
||||||
|
string = b''
|
||||||
|
for i in range(len(self.data)):
|
||||||
|
byte = ord(self.data[i])
|
||||||
|
for j in range(8):
|
||||||
|
string += b'1' if (byte & (one << j)) else b'0'
|
||||||
|
for i in range(self.bit_of_byte):
|
||||||
|
string += b'1' if (byte & (one << i)) else b'0'
|
||||||
|
return string
|
||||||
|
|
||||||
|
def seek(self, i):
|
||||||
|
self.i = i
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
cdef uchar byte, i
|
cdef uchar byte, i
|
||||||
cdef uchar one = 1
|
cdef uchar one = 1
|
||||||
start_byte = self.i // 8
|
start_byte = self.i // 8
|
||||||
if (self.i % 8) != 0:
|
start_bit = self.i % 8
|
||||||
for i in range(self.i % 8):
|
|
||||||
yield 1 if (self.data[start_byte] & (one << i)) else 0
|
if start_bit != 0 and start_byte < len(self.data):
|
||||||
|
byte = ord(self.data[start_byte])
|
||||||
|
for i in range(start_bit, 8):
|
||||||
|
self.i += 1
|
||||||
|
yield 1 if (byte & (one << i)) else 0
|
||||||
start_byte += 1
|
start_byte += 1
|
||||||
|
start_bit = 0
|
||||||
|
|
||||||
for byte in self.data[start_byte:]:
|
for byte in self.data[start_byte:]:
|
||||||
for i in range(8):
|
for i in range(8):
|
||||||
|
self.i += 1
|
||||||
yield 1 if byte & (one << i) else 0
|
yield 1 if byte & (one << i) else 0
|
||||||
for i in range(self.bit_of_byte):
|
|
||||||
yield 1 if self.byte & (one << i) else 0
|
if self.bit_of_byte != 0:
|
||||||
|
byte = self.byte
|
||||||
|
for i in range(start_bit, self.bit_of_byte):
|
||||||
|
self.i += 1
|
||||||
|
yield 1 if self.byte & (one << i) else 0
|
||||||
|
|
||||||
|
cdef uint32_t read32(self) except 0:
|
||||||
|
cdef int start_byte = self.i // 8
|
||||||
|
|
||||||
|
# TODO portability
|
||||||
|
cdef uchar[4] chars
|
||||||
|
chars[0] = <uchar>ord(self.data[start_byte])
|
||||||
|
chars[1] = <uchar>ord(self.data[start_byte+1])
|
||||||
|
chars[2] = <uchar>ord(self.data[start_byte+2])
|
||||||
|
chars[3] = <uchar>ord(self.data[start_byte+3])
|
||||||
|
cdef uint32_t output
|
||||||
|
memcpy(&output, chars, 4)
|
||||||
|
self.i += 32
|
||||||
|
return output
|
||||||
|
|
||||||
def as_bytes(self):
|
def as_bytes(self):
|
||||||
if self.bit_of_byte != 0:
|
if self.bit_of_byte != 0:
|
||||||
|
@ -47,6 +90,7 @@ cdef class BitArray:
|
||||||
else:
|
else:
|
||||||
self.byte &= ~(one << self.bit_of_byte)
|
self.byte &= ~(one << self.bit_of_byte)
|
||||||
self.bit_of_byte += 1
|
self.bit_of_byte += 1
|
||||||
|
self.i += 1
|
||||||
if self.bit_of_byte == 8:
|
if self.bit_of_byte == 8:
|
||||||
self.data += chr(self.byte)
|
self.data += chr(self.byte)
|
||||||
self.byte = 0
|
self.byte = 0
|
||||||
|
@ -65,5 +109,4 @@ cdef class BitArray:
|
||||||
self.data += chr(self.byte)
|
self.data += chr(self.byte)
|
||||||
self.byte = 0
|
self.byte = 0
|
||||||
self.bit_of_byte = 0
|
self.bit_of_byte = 0
|
||||||
|
self.i += 1
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
from .bits cimport bit_append
|
from .bits cimport bit_append
|
||||||
from .bits cimport BitArray
|
from .bits cimport BitArray
|
||||||
|
|
||||||
|
@ -16,7 +18,6 @@ cdef class HuffmanCodec:
|
||||||
weights (float[:]): A descending-sorted sequence of probabilities/weights.
|
weights (float[:]): A descending-sorted sequence of probabilities/weights.
|
||||||
Must include a weight for an EOL symbol.
|
Must include a weight for an EOL symbol.
|
||||||
|
|
||||||
eol (uint32_t): The index of the weight of the EOL symbol.
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, float[:] weights):
|
def __init__(self, float[:] weights):
|
||||||
self.codes.resize(len(weights))
|
self.codes.resize(len(weights))
|
||||||
|
@ -29,12 +30,12 @@ cdef class HuffmanCodec:
|
||||||
path.length = 0
|
path.length = 0
|
||||||
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
|
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
|
||||||
|
|
||||||
def encode(self, uint32_t[:] msg, BitArray into_bits):
|
def encode(self, attr_t[:] msg, BitArray into_bits):
|
||||||
cdef uint32_t i
|
cdef int i
|
||||||
for i in range(len(msg)):
|
for i in range(len(msg)):
|
||||||
into_bits.extend(self.codes[msg[i]].bits, self.codes[msg[i]].length)
|
into_bits.extend(self.codes[msg[i]].bits, self.codes[msg[i]].length)
|
||||||
|
|
||||||
def decode(self, bits, uint32_t[:] into_msg):
|
def decode(self, bits, attr_t[:] into_msg):
|
||||||
node = self.nodes.back()
|
node = self.nodes.back()
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef int n = len(into_msg)
|
cdef int n = len(into_msg)
|
||||||
|
@ -49,7 +50,8 @@ cdef class HuffmanCodec:
|
||||||
if i == n:
|
if i == n:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
raise Exception
|
raise Exception(
|
||||||
|
"Buffer exhausted at %d/%d symbols read." % (i, len(into_msg)))
|
||||||
|
|
||||||
property strings:
|
property strings:
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
|
|
|
@ -2,5 +2,6 @@ from ..vocab cimport Vocab
|
||||||
|
|
||||||
|
|
||||||
cdef class Packer:
|
cdef class Packer:
|
||||||
cdef tuple _codecs
|
cdef readonly tuple attrs
|
||||||
cdef Vocab vocab
|
cdef readonly tuple _codecs
|
||||||
|
cdef readonly Vocab vocab
|
||||||
|
|
|
@ -7,7 +7,7 @@ from libcpp.pair cimport pair
|
||||||
from cymem.cymem cimport Address, Pool
|
from cymem.cymem cimport Address, Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
from ..attrs cimport ID, ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
@ -50,24 +50,28 @@ cdef class _BinaryCodec:
|
||||||
for i in range(len(msg)):
|
for i in range(len(msg)):
|
||||||
bits.append(msg[i])
|
bits.append(msg[i])
|
||||||
|
|
||||||
def decode(self, bits, attr_t[:] msg):
|
def decode(self, BitArray bits, attr_t[:] msg):
|
||||||
for i in range(len(msg)):
|
cdef int i = 0
|
||||||
msg[i] = bits.next()
|
for bit in bits:
|
||||||
|
msg[i] = bit
|
||||||
|
i += 1
|
||||||
|
if i == len(msg):
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
cdef class _AttributeCodec:
|
cdef class _AttributeCodec:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef attr_t* _keys
|
cdef attr_t* _keys
|
||||||
cdef PreshMap _map
|
cdef dict _map
|
||||||
cdef HuffmanCodec _codec
|
cdef HuffmanCodec _codec
|
||||||
|
|
||||||
def __init__(self, freqs):
|
def __init__(self, freqs):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
cdef uint64_t key
|
cdef attr_t key
|
||||||
cdef uint64_t count
|
cdef float count
|
||||||
cdef pair[uint64_t, uint64_t] item
|
cdef pair[float, attr_t] item
|
||||||
|
|
||||||
cdef priority_queue[pair[uint64_t, uint64_t]] items
|
cdef priority_queue[pair[float, attr_t]] items
|
||||||
|
|
||||||
for key, count in freqs:
|
for key, count in freqs:
|
||||||
item.first = count
|
item.first = count
|
||||||
|
@ -75,7 +79,7 @@ cdef class _AttributeCodec:
|
||||||
items.push(item)
|
items.push(item)
|
||||||
weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
|
weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
|
||||||
self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
|
self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
|
||||||
self._map = PreshMap()
|
self._map = {}
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
while not items.empty():
|
while not items.empty():
|
||||||
item = items.top()
|
item = items.top()
|
||||||
|
@ -88,8 +92,9 @@ cdef class _AttributeCodec:
|
||||||
self._codec = HuffmanCodec(weights)
|
self._codec = HuffmanCodec(weights)
|
||||||
|
|
||||||
def encode(self, attr_t[:] msg, BitArray dest):
|
def encode(self, attr_t[:] msg, BitArray dest):
|
||||||
|
cdef int i
|
||||||
for i in range(len(msg)):
|
for i in range(len(msg)):
|
||||||
msg[i] = <attr_t>self._map[msg[i]]
|
msg[i] = self._map[msg[i]]
|
||||||
self._codec.encode(msg, dest)
|
self._codec.encode(msg, dest)
|
||||||
|
|
||||||
def decode(self, BitArray bits, attr_t[:] dest):
|
def decode(self, BitArray bits, attr_t[:] dest):
|
||||||
|
@ -103,30 +108,36 @@ cdef class Packer:
|
||||||
def __init__(self, Vocab vocab, list_of_attr_freqs):
|
def __init__(self, Vocab vocab, list_of_attr_freqs):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
codecs = []
|
codecs = []
|
||||||
self.attrs = []
|
attrs = []
|
||||||
|
|
||||||
for attr, freqs in list_of_attr_freqs:
|
for attr, freqs in list_of_attr_freqs:
|
||||||
if attr == ORTH:
|
if attr == ID:
|
||||||
codecs.append(make_vocab_codec(vocab))
|
codecs.append(make_vocab_codec(vocab))
|
||||||
elif attr == SPACY:
|
elif attr == SPACY:
|
||||||
codecs.append(_BinaryCodec())
|
codecs.append(_BinaryCodec())
|
||||||
else:
|
else:
|
||||||
codecs.append(_AttributeCodec(freqs))
|
codecs.append(_AttributeCodec(freqs))
|
||||||
self.attrs.append(attr)
|
attrs.append(attr)
|
||||||
self._codecs = tuple(codecs)
|
self._codecs = tuple(codecs)
|
||||||
|
self.attrs = tuple(attrs)
|
||||||
|
|
||||||
def pack(self, Doc doc):
|
def pack(self, Doc doc):
|
||||||
array = doc.to_array(self.attrs)
|
array = doc.to_array(self.attrs)
|
||||||
cdef BitArray bits = BitArray()
|
cdef BitArray bits = BitArray()
|
||||||
cdef uint32_t length = len(array)
|
cdef uint32_t length = 3
|
||||||
bits.extend(length, 32)
|
#cdef uint32_t length = len(doc)
|
||||||
|
#bits.extend(length, 32)
|
||||||
for i, codec in enumerate(self._codecs):
|
for i, codec in enumerate(self._codecs):
|
||||||
codec.encode(array[i], bits)
|
codec.encode(array[:, i], bits)
|
||||||
return bits
|
return bits
|
||||||
|
|
||||||
def unpack(self, bits):
|
def unpack(self, BitArray bits):
|
||||||
cdef uint32_t length = bits.read(32)
|
bits.seek(0)
|
||||||
array = numpy.ndarray(shape=(len(self.codecs), length), dtype=numpy.int)
|
#cdef uint32_t length = bits.read32()
|
||||||
for i, codec in enumerate(self.codecs):
|
cdef uint32_t length = 3
|
||||||
array[i] = codec.decode(bits)
|
array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
|
||||||
return Doc.from_array(self.vocab, self.attrs, array)
|
for i, codec in enumerate(self._codecs):
|
||||||
|
codec.decode(bits, array[:, i])
|
||||||
|
doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1])
|
||||||
|
doc.from_array(self.attrs, array)
|
||||||
|
return doc
|
||||||
|
|
|
@ -97,17 +97,15 @@ cdef class Doc:
|
||||||
self._py_tokens = []
|
self._py_tokens = []
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_orth(cls, Vocab vocab, attr_t[:] orths, attr_t[:] spaces):
|
def from_ids(cls, Vocab vocab, ids, spaces):
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef const LexemeC* lex
|
cdef const LexemeC* lex
|
||||||
cdef Doc self = cls(vocab)
|
cdef Doc self = cls(vocab)
|
||||||
cdef unicode string
|
cdef bint space = 0
|
||||||
cdef UniStr new_orth_c
|
for i in range(len(ids)):
|
||||||
for i in range(len(orths)):
|
lex = self.vocab.lexemes.at(ids[i])
|
||||||
string = vocab.strings[orths[i]]
|
space = spaces[i]
|
||||||
slice_unicode(&new_orth_c, string, 0, len(string))
|
self.push_back(lex, space)
|
||||||
lex = self.vocab.get(self.mem, &new_orth_c)
|
|
||||||
self.push_back(lex, spaces[i])
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
|
@ -229,11 +227,11 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef attr_id_t feature
|
cdef attr_id_t feature
|
||||||
cdef np.ndarray[long, ndim=2] output
|
cdef np.ndarray[attr_t, ndim=2] output
|
||||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||||
# dict iteration.
|
# dict iteration.
|
||||||
cdef np.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
|
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
|
||||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
|
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
for j, feature in enumerate(attr_ids):
|
for j, feature in enumerate(attr_ids):
|
||||||
output[i, j] = get_token_attr(&self.data[i], feature)
|
output[i, j] = get_token_attr(&self.data[i], feature)
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
|
||||||
from libc.stdint cimport uint8_t
|
from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
|
|
||||||
ctypedef uint64_t hash_t
|
ctypedef uint64_t hash_t
|
||||||
ctypedef char* utf8_t
|
ctypedef char* utf8_t
|
||||||
ctypedef uint32_t attr_t
|
ctypedef int32_t attr_t
|
||||||
ctypedef uint64_t flags_t
|
ctypedef uint64_t flags_t
|
||||||
ctypedef uint32_t id_t
|
ctypedef uint32_t id_t
|
||||||
ctypedef uint16_t len_t
|
ctypedef uint16_t len_t
|
||||||
|
|
Loading…
Reference in New Issue
Block a user