From ae78c9e3ce604a5a73a44789461f90f9b4c82c42 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 19 Jul 2015 22:03:39 +0200 Subject: [PATCH] * Implement character-based codec, so that we can do word/char backoff --- spacy/serialize/packer.pyx | 121 +++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx index 8c7adb1d8..dc7b7a8e8 100644 --- a/spacy/serialize/packer.pyx +++ b/spacy/serialize/packer.pyx @@ -1,12 +1,18 @@ # cython: profile=True +from __future__ import unicode_literals + from libc.stdint cimport uint32_t from libc.stdint cimport uint64_t from libc.math cimport exp as c_exp from libcpp.queue cimport priority_queue from libcpp.pair cimport pair +from ..structs cimport UniStr +from ..strings cimport slice_unicode + from cymem.cymem cimport Address, Pool from preshed.maps cimport PreshMap +from preshed.counter cimport PreshCounter from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from ..tokens.doc cimport Doc @@ -51,51 +57,6 @@ cdef class _BinaryCodec: break -cdef class _AttributeCodec: - cdef Pool mem - cdef attr_t* _keys - cdef dict _map - cdef HuffmanCodec _codec - - def __init__(self, freqs): - self.mem = Pool() - cdef attr_t key - cdef float count - cdef pair[float, attr_t] item - - cdef priority_queue[pair[float, attr_t]] items - - for key, count in freqs: - item.first = count - item.second = key - items.push(item) - weights = numpy.ndarray(shape=(items.size(),), dtype=numpy.float32) - self._keys = self.mem.alloc(items.size(), sizeof(attr_t)) - self._map = {} - cdef int i = 0 - while not items.empty(): - item = items.top() - # We put freq first above, for sorting - self._keys[i] = item.second - weights[i] = item.first - self._map[self._keys[i]] = i - items.pop() - i += 1 - self._codec = HuffmanCodec(weights) - - def encode(self, attr_t[:] msg, BitArray dest): - cdef int i - for i in range(len(msg)): - msg[i] = self._map[msg[i]] - self._codec.encode(msg, dest) - - def decode(self, BitArray bits, attr_t[:] dest): - cdef int i - self._codec.decode(bits, dest) - for i in range(len(dest)): - dest[i] = self._keys[dest[i]] - - def _gen_orths(Vocab vocab): cdef attr_t orth cdef size_t addr @@ -104,17 +65,34 @@ def _gen_orths(Vocab vocab): yield orth, c_exp(lex.prob) +def _gen_chars(Vocab vocab): + cdef attr_t orth + cdef size_t addr + char_weights = {b' ': 0.0} + cdef unicode string + cdef unicode char + for orth, addr in vocab._by_orth.items(): + lex = addr + string = vocab.strings[lex.orth] + for char in string: + char_weights.setdefault(char, 0.0) + char_weights[char] += c_exp(lex.prob) + char_weights[u' '] += c_exp(lex.prob) + return char_weights.items() + + cdef class Packer: def __init__(self, Vocab vocab, attr_freqs): self.vocab = vocab - self.lex_codec = _AttributeCodec(_gen_orths(vocab)) + self.orth_codec = HuffmanCodec(_gen_orths(vocab)) + self.char_codec = HuffmanCodec(_gen_chars(vocab)) - codecs = [_AttributeCodec(_gen_orths(vocab)), _BinaryCodec()] - attrs = [ORTH, SPACY] + codecs = [] + attrs = [] for attr, freqs in sorted(attr_freqs): if attr in (ORTH, ID, SPACY): continue - codecs.append(_AttributeCodec(freqs)) + codecs.append(HuffmanCodec(freqs)) attrs.append(attr) self._codecs = tuple(codecs) self.attrs = tuple(attrs) @@ -124,10 +102,11 @@ cdef class Packer: return cls(vocab, util.read_encoding_freqs(data_dir)) def pack(self, Doc doc): - array = doc.to_array(self.attrs) cdef BitArray bits = BitArray() - cdef uint32_t length = len(doc) + cdef uint32_t length = len(doc.string) bits.extend(length, 32) + self._char_encode(doc, bits) + array = doc.to_array(self.attrs) for i, codec in enumerate(self._codecs): codec.encode(array[:, i], bits) return bits @@ -135,7 +114,43 @@ cdef class Packer: def unpack(self, BitArray bits): bits.seek(0) cdef uint32_t length = bits.read32() - array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32) + doc = self._char_decode(bits, length) + + array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32) for i, codec in enumerate(self._codecs): codec.decode(bits, array[:, i]) - return array + + doc.from_array(self.attrs, array) + return doc + + def _char_encode(self, Doc doc, BitArray bits): + cdef unicode string = doc.string + self.char_codec.encode(string, bits) + for token in doc: + for i in range(len(token)-1): + bits.append(False) + bits.append(True) + if token.whitespace_: + bits.append(False) + + def _char_decode(self, BitArray bits, n): + chars = [u''] * n + self.char_codec.decode(bits, chars) + + cdef unicode string = u''.join(chars) + cdef Doc tokens = Doc(self.vocab) + cdef int i + cdef int start = 0 + cdef bint is_spacy + cdef UniStr span + cdef int length = len(string) + iter_bits = iter(bits) + for i in range(length): + is_end_token = iter_bits.next() + if is_end_token: + slice_unicode(&span, string, start, i+1) + lex = self.vocab.get(tokens.mem, &span) + is_spacy = (i+1) < length and string[i+1] == u' ' + tokens.push_back(lex, is_spacy) + start = i + 1 + is_spacy + return tokens