diff --git a/spacy/serialize/bits.pxd b/spacy/serialize/bits.pxd index 51ecf4c63..fea5ad786 100644 --- a/spacy/serialize/bits.pxd +++ b/spacy/serialize/bits.pxd @@ -19,3 +19,5 @@ cdef class BitArray: cdef uint32_t i cdef int extend(self, uint64_t code, char n_bits) except -1 + + cdef uint32_t read32(self) except 0 diff --git a/spacy/serialize/bits.pyx b/spacy/serialize/bits.pyx index 7df236537..340b18cd5 100644 --- a/spacy/serialize/bits.pyx +++ b/spacy/serialize/bits.pyx @@ -1,4 +1,4 @@ - +from libc.string cimport memcpy # Note that we're setting the most significant bits here first, when in practice # we're actually wanting the last bit to be most significant (for Huffman coding, @@ -20,19 +20,62 @@ cdef class BitArray: self.bit_of_byte = 0 self.i = 0 + def __len__(self): + return 8 * len(self.data) + self.bit_of_byte + + def __str__(self): + cdef uchar byte, i + cdef uchar one = 1 + string = b'' + for i in range(len(self.data)): + byte = ord(self.data[i]) + for j in range(8): + string += b'1' if (byte & (one << j)) else b'0' + for i in range(self.bit_of_byte): + string += b'1' if (byte & (one << i)) else b'0' + return string + + def seek(self, i): + self.i = i + def __iter__(self): cdef uchar byte, i cdef uchar one = 1 start_byte = self.i // 8 - if (self.i % 8) != 0: - for i in range(self.i % 8): - yield 1 if (self.data[start_byte] & (one << i)) else 0 + start_bit = self.i % 8 + + if start_bit != 0 and start_byte < len(self.data): + byte = ord(self.data[start_byte]) + for i in range(start_bit, 8): + self.i += 1 + yield 1 if (byte & (one << i)) else 0 start_byte += 1 + start_bit = 0 + for byte in self.data[start_byte:]: for i in range(8): + self.i += 1 yield 1 if byte & (one << i) else 0 - for i in range(self.bit_of_byte): - yield 1 if self.byte & (one << i) else 0 + + if self.bit_of_byte != 0: + byte = self.byte + for i in range(start_bit, self.bit_of_byte): + self.i += 1 + yield 1 if self.byte & (one << i) else 0 + + cdef uint32_t read32(self) except 0: + cdef int start_byte = self.i // 8 + + # TODO portability + cdef uchar[4] chars + chars[0] = ord(self.data[start_byte]) + chars[1] = ord(self.data[start_byte+1]) + chars[2] = ord(self.data[start_byte+2]) + chars[3] = ord(self.data[start_byte+3]) + cdef uint32_t output + memcpy(&output, chars, 4) + self.i += 32 + return output def as_bytes(self): if self.bit_of_byte != 0: @@ -47,6 +90,7 @@ cdef class BitArray: else: self.byte &= ~(one << self.bit_of_byte) self.bit_of_byte += 1 + self.i += 1 if self.bit_of_byte == 8: self.data += chr(self.byte) self.byte = 0 @@ -65,5 +109,4 @@ cdef class BitArray: self.data += chr(self.byte) self.byte = 0 self.bit_of_byte = 0 - - + self.i += 1 diff --git a/spacy/serialize/huffman.pyx b/spacy/serialize/huffman.pyx index 826ee4e29..24dc6c30c 100644 --- a/spacy/serialize/huffman.pyx +++ b/spacy/serialize/huffman.pyx @@ -1,5 +1,7 @@ cimport cython +from ..typedefs cimport attr_t + from .bits cimport bit_append from .bits cimport BitArray @@ -16,7 +18,6 @@ cdef class HuffmanCodec: weights (float[:]): A descending-sorted sequence of probabilities/weights. Must include a weight for an EOL symbol. - eol (uint32_t): The index of the weight of the EOL symbol. """ def __init__(self, float[:] weights): self.codes.resize(len(weights)) @@ -29,12 +30,12 @@ cdef class HuffmanCodec: path.length = 0 assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path) - def encode(self, uint32_t[:] msg, BitArray into_bits): - cdef uint32_t i + def encode(self, attr_t[:] msg, BitArray into_bits): + cdef int i for i in range(len(msg)): into_bits.extend(self.codes[msg[i]].bits, self.codes[msg[i]].length) - def decode(self, bits, uint32_t[:] into_msg): + def decode(self, bits, attr_t[:] into_msg): node = self.nodes.back() cdef int i = 0 cdef int n = len(into_msg) @@ -49,7 +50,8 @@ cdef class HuffmanCodec: if i == n: break else: - raise Exception + raise Exception( + "Buffer exhausted at %d/%d symbols read." % (i, len(into_msg))) property strings: @cython.boundscheck(False) diff --git a/spacy/serialize/packer.pxd b/spacy/serialize/packer.pxd index f6fca5c9e..02bcdf56e 100644 --- a/spacy/serialize/packer.pxd +++ b/spacy/serialize/packer.pxd @@ -2,5 +2,6 @@ from ..vocab cimport Vocab cdef class Packer: - cdef tuple _codecs - cdef Vocab vocab + cdef readonly tuple attrs + cdef readonly tuple _codecs + cdef readonly Vocab vocab diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx index 0edcbde04..01ae9947c 100644 --- a/spacy/serialize/packer.pyx +++ b/spacy/serialize/packer.pyx @@ -7,7 +7,7 @@ from libcpp.pair cimport pair from cymem.cymem cimport Address, Pool from preshed.maps cimport PreshMap -from ..attrs cimport ID, ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..typedefs cimport attr_t @@ -50,24 +50,28 @@ cdef class _BinaryCodec: for i in range(len(msg)): bits.append(msg[i]) - def decode(self, bits, attr_t[:] msg): - for i in range(len(msg)): - msg[i] = bits.next() + def decode(self, BitArray bits, attr_t[:] msg): + cdef int i = 0 + for bit in bits: + msg[i] = bit + i += 1 + if i == len(msg): + break cdef class _AttributeCodec: cdef Pool mem cdef attr_t* _keys - cdef PreshMap _map + cdef dict _map cdef HuffmanCodec _codec def __init__(self, freqs): self.mem = Pool() - cdef uint64_t key - cdef uint64_t count - cdef pair[uint64_t, uint64_t] item + cdef attr_t key + cdef float count + cdef pair[float, attr_t] item - cdef priority_queue[pair[uint64_t, uint64_t]] items + cdef priority_queue[pair[float, attr_t]] items for key, count in freqs: item.first = count @@ -75,7 +79,7 @@ cdef class _AttributeCodec: items.push(item) weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32) self._keys = self.mem.alloc(len(freqs), sizeof(attr_t)) - self._map = PreshMap() + self._map = {} cdef int i = 0 while not items.empty(): item = items.top() @@ -88,8 +92,9 @@ cdef class _AttributeCodec: self._codec = HuffmanCodec(weights) def encode(self, attr_t[:] msg, BitArray dest): + cdef int i for i in range(len(msg)): - msg[i] = self._map[msg[i]] + msg[i] = self._map[msg[i]] self._codec.encode(msg, dest) def decode(self, BitArray bits, attr_t[:] dest): @@ -103,30 +108,36 @@ cdef class Packer: def __init__(self, Vocab vocab, list_of_attr_freqs): self.vocab = vocab codecs = [] - self.attrs = [] + attrs = [] for attr, freqs in list_of_attr_freqs: - if attr == ORTH: + if attr == ID: codecs.append(make_vocab_codec(vocab)) elif attr == SPACY: codecs.append(_BinaryCodec()) else: codecs.append(_AttributeCodec(freqs)) - self.attrs.append(attr) + attrs.append(attr) self._codecs = tuple(codecs) + self.attrs = tuple(attrs) def pack(self, Doc doc): array = doc.to_array(self.attrs) cdef BitArray bits = BitArray() - cdef uint32_t length = len(array) - bits.extend(length, 32) + cdef uint32_t length = 3 + #cdef uint32_t length = len(doc) + #bits.extend(length, 32) for i, codec in enumerate(self._codecs): - codec.encode(array[i], bits) + codec.encode(array[:, i], bits) return bits - def unpack(self, bits): - cdef uint32_t length = bits.read(32) - array = numpy.ndarray(shape=(len(self.codecs), length), dtype=numpy.int) - for i, codec in enumerate(self.codecs): - array[i] = codec.decode(bits) - return Doc.from_array(self.vocab, self.attrs, array) + def unpack(self, BitArray bits): + bits.seek(0) + #cdef uint32_t length = bits.read32() + cdef uint32_t length = 3 + array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32) + for i, codec in enumerate(self._codecs): + codec.decode(bits, array[:, i]) + doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1]) + doc.from_array(self.attrs, array) + return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index db3df7156..df5a0f6c0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -97,17 +97,15 @@ cdef class Doc: self._py_tokens = [] @classmethod - def from_orth(cls, Vocab vocab, attr_t[:] orths, attr_t[:] spaces): + def from_ids(cls, Vocab vocab, ids, spaces): cdef int i cdef const LexemeC* lex cdef Doc self = cls(vocab) - cdef unicode string - cdef UniStr new_orth_c - for i in range(len(orths)): - string = vocab.strings[orths[i]] - slice_unicode(&new_orth_c, string, 0, len(string)) - lex = self.vocab.get(self.mem, &new_orth_c) - self.push_back(lex, spaces[i]) + cdef bint space = 0 + for i in range(len(ids)): + lex = self.vocab.lexemes.at(ids[i]) + space = spaces[i] + self.push_back(lex, space) return self def __getitem__(self, object i): @@ -229,11 +227,11 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature - cdef np.ndarray[long, ndim=2] output + cdef np.ndarray[attr_t, ndim=2] output # Make an array from the attributes --- otherwise our inner loop is Python # dict iteration. - cdef np.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids) - output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int) + cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) + output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.data[i], feature) diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 84a6794e0..0257f0e81 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,10 +1,10 @@ -from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t +from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t from libc.stdint cimport uint8_t ctypedef uint64_t hash_t ctypedef char* utf8_t -ctypedef uint32_t attr_t +ctypedef int32_t attr_t ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t