From cf0c78889232d5bb8fea868edd44b684f34097e7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 17 Jul 2015 21:20:48 +0200
Subject: [PATCH] * Tests passing on round-trip pack/unpack on basic example

---
 spacy/serialize/bits.pxd    |  2 ++
 spacy/serialize/bits.pyx    | 59 ++++++++++++++++++++++++++++++++-----
 spacy/serialize/huffman.pyx | 12 ++++----
 spacy/serialize/packer.pxd  |  5 ++--
 spacy/serialize/packer.pyx  | 57 ++++++++++++++++++++---------------
 spacy/tokens/doc.pyx        | 20 ++++++-------
 spacy/typedefs.pxd          |  4 +--
 7 files changed, 108 insertions(+), 51 deletions(-)
diff --git a/spacy/serialize/bits.pxd b/spacy/serialize/bits.pxd
index 51ecf4c63..fea5ad786 100644
--- a/spacy/serialize/bits.pxd
+++ b/spacy/serialize/bits.pxd
@@ -19,3 +19,5 @@ cdef class BitArray:
     cdef uint32_t i
     
     cdef int extend(self, uint64_t code, char n_bits) except -1
+
+    cdef uint32_t read32(self) except 0
diff --git a/spacy/serialize/bits.pyx b/spacy/serialize/bits.pyx
index 7df236537..340b18cd5 100644
--- a/spacy/serialize/bits.pyx
+++ b/spacy/serialize/bits.pyx
@@ -1,4 +1,4 @@
-
+from libc.string cimport memcpy
 
 # Note that we're setting the most significant bits here first, when in practice
 # we're actually wanting the last bit to be most significant (for Huffman coding,
@@ -20,19 +20,62 @@ cdef class BitArray:
         self.bit_of_byte = 0
         self.i = 0
 
+    def __len__(self):
+        return 8 * len(self.data) + self.bit_of_byte
+
+    def __str__(self):
+        cdef uchar byte, i
+        cdef uchar one = 1
+        string = b''
+        for i in range(len(self.data)):
+            byte = ord(self.data[i])
+            for j in range(8):
+                string += b'1' if (byte & (one << j)) else b'0'
+        for i in range(self.bit_of_byte):
+            string += b'1' if (byte & (one << i)) else b'0'
+        return string
+
+    def seek(self, i):
+        self.i = i
+
     def __iter__(self):
         cdef uchar byte, i
         cdef uchar one = 1
         start_byte = self.i // 8
-        if (self.i % 8) != 0:
-            for i in range(self.i % 8):
-                yield 1 if (self.data[start_byte] & (one << i)) else 0
+        start_bit = self.i % 8
+
+        if start_bit != 0 and start_byte < len(self.data):
+            byte = ord(self.data[start_byte])
+            for i in range(start_bit, 8):
+                self.i += 1
+                yield 1 if (byte & (one << i)) else 0
             start_byte += 1
+            start_bit = 0
+
         for byte in self.data[start_byte:]:
             for i in range(8):
+                self.i += 1
                 yield 1 if byte & (one << i) else 0
-        for i in range(self.bit_of_byte):
-            yield 1 if self.byte & (one << i) else 0
+
+        if self.bit_of_byte != 0:
+            byte = self.byte
+            for i in range(start_bit, self.bit_of_byte):
+                self.i += 1
+                yield 1 if self.byte & (one << i) else 0
+
+    cdef uint32_t read32(self) except 0:
+        cdef int start_byte = self.i // 8
+
+        # TODO portability
+        cdef uchar[4] chars
+        chars[0] = <uchar>ord(self.data[start_byte])
+        chars[1] = <uchar>ord(self.data[start_byte+1])
+        chars[2] = <uchar>ord(self.data[start_byte+2])
+        chars[3] = <uchar>ord(self.data[start_byte+3])
+        cdef uint32_t output
+        memcpy(&output, chars, 4)
+        self.i += 32
+        return output
 
     def as_bytes(self):
         if self.bit_of_byte != 0:
@@ -47,6 +90,7 @@ cdef class BitArray:
         else:
             self.byte &= ~(one << self.bit_of_byte)
         self.bit_of_byte += 1
+        self.i += 1
         if self.bit_of_byte == 8:
             self.data += chr(self.byte)
             self.byte = 0
@@ -65,5 +109,4 @@ cdef class BitArray:
                 self.data += chr(self.byte)
                 self.byte = 0
                 self.bit_of_byte = 0
-
-
+            self.i += 1
diff --git a/spacy/serialize/huffman.pyx b/spacy/serialize/huffman.pyx
index 826ee4e29..24dc6c30c 100644
--- a/spacy/serialize/huffman.pyx
+++ b/spacy/serialize/huffman.pyx
@@ -1,5 +1,7 @@
 cimport cython
 
+from ..typedefs cimport attr_t
+
 from .bits cimport bit_append
 from .bits cimport BitArray
 
@@ -16,7 +18,6 @@ cdef class HuffmanCodec:
         weights (float[:]): A descending-sorted sequence of probabilities/weights.
           Must include a weight for an EOL symbol.
 
-        eol (uint32_t): The index of the weight of the EOL symbol.
     """
     def __init__(self, float[:] weights):
         self.codes.resize(len(weights))
@@ -29,12 +30,12 @@ cdef class HuffmanCodec:
         path.length = 0
         assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
 
-    def encode(self, uint32_t[:] msg, BitArray into_bits):
-        cdef uint32_t i
+    def encode(self, attr_t[:] msg, BitArray into_bits):
+        cdef int i
         for i in range(len(msg)):
             into_bits.extend(self.codes[msg[i]].bits, self.codes[msg[i]].length)
 
-    def decode(self, bits, uint32_t[:] into_msg):
+    def decode(self, bits, attr_t[:] into_msg):
         node = self.nodes.back()
         cdef int i = 0
         cdef int n = len(into_msg)
@@ -49,7 +50,8 @@ cdef class HuffmanCodec:
                 if i == n:
                     break
         else:
-            raise Exception
+            raise Exception(
+                "Buffer exhausted at %d/%d symbols read." % (i, len(into_msg)))
 
     property strings:
         @cython.boundscheck(False)
diff --git a/spacy/serialize/packer.pxd b/spacy/serialize/packer.pxd
index f6fca5c9e..02bcdf56e 100644
--- a/spacy/serialize/packer.pxd
+++ b/spacy/serialize/packer.pxd
@@ -2,5 +2,6 @@ from ..vocab cimport Vocab
 
 
 cdef class Packer:
-    cdef tuple _codecs
-    cdef Vocab vocab
+    cdef readonly tuple attrs
+    cdef readonly tuple _codecs
+    cdef readonly Vocab vocab
diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx
index 0edcbde04..01ae9947c 100644
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@@ -7,7 +7,7 @@ from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap
 
-from ..attrs cimport ID, ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
+from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..typedefs cimport attr_t
@@ -50,24 +50,28 @@ cdef class _BinaryCodec:
         for i in range(len(msg)):
             bits.append(msg[i])
 
-    def decode(self, bits, attr_t[:] msg):
-        for i in range(len(msg)):
-            msg[i] = bits.next()
+    def decode(self, BitArray bits, attr_t[:] msg):
+        cdef int i = 0 
+        for bit in bits:
+            msg[i] = bit
+            i += 1
+            if i == len(msg):
+                break
 
 
 cdef class _AttributeCodec:
     cdef Pool mem
     cdef attr_t* _keys
-    cdef PreshMap _map
+    cdef dict _map
     cdef HuffmanCodec _codec
 
     def __init__(self, freqs):
         self.mem = Pool()
-        cdef uint64_t key
-        cdef uint64_t count
-        cdef pair[uint64_t, uint64_t] item
+        cdef attr_t key
+        cdef float count
+        cdef pair[float, attr_t] item
 
-        cdef priority_queue[pair[uint64_t, uint64_t]] items
+        cdef priority_queue[pair[float, attr_t]] items
 
         for key, count in freqs:
             item.first = count
@@ -75,7 +79,7 @@ cdef class _AttributeCodec:
             items.push(item)
         weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
         self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
-        self._map = PreshMap()
+        self._map = {}
         cdef int i = 0
         while not items.empty():
             item = items.top()
@@ -88,8 +92,9 @@ cdef class _AttributeCodec:
         self._codec = HuffmanCodec(weights)
 
     def encode(self, attr_t[:] msg, BitArray dest):
+        cdef int i
         for i in range(len(msg)):
-            msg[i] = <attr_t>self._map[msg[i]]
+            msg[i] = self._map[msg[i]]
         self._codec.encode(msg, dest)
 
     def decode(self, BitArray bits, attr_t[:] dest):
@@ -103,30 +108,36 @@ cdef class Packer:
     def __init__(self, Vocab vocab, list_of_attr_freqs):
         self.vocab = vocab
         codecs = []
-        self.attrs = []
+        attrs = []
 
         for attr, freqs in list_of_attr_freqs:
-            if attr == ORTH:
+            if attr == ID:
                 codecs.append(make_vocab_codec(vocab))
             elif attr == SPACY:
                 codecs.append(_BinaryCodec())
             else:
                 codecs.append(_AttributeCodec(freqs))
-            self.attrs.append(attr)
+            attrs.append(attr)
         self._codecs = tuple(codecs)
+        self.attrs = tuple(attrs)
 
     def pack(self, Doc doc):
         array = doc.to_array(self.attrs)
         cdef BitArray bits = BitArray()
-        cdef uint32_t length = len(array)
-        bits.extend(length, 32)
+        cdef uint32_t length = 3
+        #cdef uint32_t length = len(doc)
+        #bits.extend(length, 32)
         for i, codec in enumerate(self._codecs):
-            codec.encode(array[i], bits)
+            codec.encode(array[:, i], bits)
         return bits
 
-    def unpack(self, bits):
-        cdef uint32_t length = bits.read(32)
-        array = numpy.ndarray(shape=(len(self.codecs), length), dtype=numpy.int)
-        for i, codec in enumerate(self.codecs):
-            array[i] = codec.decode(bits)
-        return Doc.from_array(self.vocab, self.attrs, array)
+    def unpack(self, BitArray bits):
+        bits.seek(0)
+        #cdef uint32_t length = bits.read32()
+        cdef uint32_t length = 3
+        array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
+        for i, codec in enumerate(self._codecs):
+            codec.decode(bits, array[:, i])
+        doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1])
+        doc.from_array(self.attrs, array)
+        return doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index db3df7156..df5a0f6c0 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -97,17 +97,15 @@ cdef class Doc:
         self._py_tokens = []
 
     @classmethod
-    def from_orth(cls, Vocab vocab, attr_t[:] orths, attr_t[:] spaces):
+    def from_ids(cls, Vocab vocab, ids, spaces):
         cdef int i
         cdef const LexemeC* lex
         cdef Doc self = cls(vocab)
-        cdef unicode string
-        cdef UniStr new_orth_c
-        for i in range(len(orths)):
-            string = vocab.strings[orths[i]]
-            slice_unicode(&new_orth_c, string, 0, len(string))
-            lex = self.vocab.get(self.mem, &new_orth_c)
-            self.push_back(lex, spaces[i])
+        cdef bint space = 0
+        for i in range(len(ids)):
+            lex = self.vocab.lexemes.at(ids[i])
+            space = spaces[i]
+            self.push_back(lex, space)
         return self
 
     def __getitem__(self, object i):
@@ -229,11 +227,11 @@ cdef class Doc:
         """
         cdef int i, j
         cdef attr_id_t feature
-        cdef np.ndarray[long, ndim=2] output
+        cdef np.ndarray[attr_t, ndim=2] output
         # Make an array from the attributes --- otherwise our inner loop is Python
         # dict iteration.
-        cdef np.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
-        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):
                 output[i, j] = get_token_attr(&self.data[i], feature)
diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd
index 84a6794e0..0257f0e81 100644
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -1,10 +1,10 @@
-from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
+from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
 from libc.stdint cimport uint8_t
 
 
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
-ctypedef uint32_t attr_t
+ctypedef int32_t attr_t
 ctypedef uint64_t flags_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t