spaCy/spacy/serialize/packer.pyx

# cython: profile=True
from __future__ import unicode_literals

from libc.stdint cimport uint32_t, int32_t
from libc.stdint cimport uint64_t
from libc.math cimport exp as c_exp
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair

from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
import json

from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..structs cimport LexemeC
from ..typedefs cimport attr_t
from .bits cimport BitArray
from .huffman cimport HuffmanCodec

from os import path
import numpy
from .. import util

cimport cython


# Format
# - Total number of bytes in message (32 bit int) --- handled outside this
# - Number of words (32 bit int)
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
# - Spaces 1 bit per word
# - Attributes:
#       POS tag
#       Head offset
#       Dep label
#       Entity IOB
#       Entity tag


cdef class _BinaryCodec:
    """Trivial codec that stores each value of a message as a single bit."""
    def encode(self, attr_t[:] msg, BitArray bits):
        """Append one bit per element of `msg` to `bits`.

        Each element is handed to `bits.append`, in order.
        """
        cdef Py_ssize_t n_values = len(msg)
        cdef Py_ssize_t idx
        for idx in range(n_values):
            bits.append(msg[idx])

    def decode(self, BitArray bits, attr_t[:] msg):
        """Fill `msg` with successive bits read by iterating over `bits`.

        Reading stops once `msg` is full, or earlier if `bits` runs out, in
        which case the remaining slots of `msg` are left untouched.
        """
        cdef Py_ssize_t n_values = len(msg)
        cdef Py_ssize_t filled = 0
        # Nothing to decode: return before touching the stream.  Without this
        # guard the loop below would take a bit and write it to msg[0], which
        # does not exist for an empty buffer.
        if n_values == 0:
            return
        for bit in bits:
            msg[filled] = bit
            filled += 1
            if filled == n_values:
                break


def _gen_orths(Vocab vocab):
    """Yield an (orth ID, weight) pair for every entry of `vocab._by_orth`.

    The weight is `exp(prob)` of the lexeme struct the entry points at
    (presumably `prob` is a log probability -- confirm against LexemeC).
    """
    cdef attr_t word_id
    cdef size_t lex_addr
    for word_id, lex_addr in vocab._by_orth.items():
        # The map value is an address; reinterpret it as a LexemeC pointer.
        yield word_id, c_exp((<LexemeC*>lex_addr).prob)


def _gen_chars(Vocab vocab):
    """Return (byte value, weight) pairs derived from the vocabulary.

    Every byte value 0-255 starts with a tiny weight (1e-20), so all of them
    appear in the result.  For each entry of `vocab._by_orth`, `exp(prob)` of
    the lexeme is added once per byte of its UTF-8 encoded string, plus once
    to the space character.

    Returns the `.items()` of the accumulated {byte value: weight} dict.
    """
    cdef attr_t orth
    cdef size_t addr
    cdef LexemeC* lex
    cdef unicode string
    cdef bytes char
    cdef bytes utf8_str
    cdef double weight
    char_weights = {i: 1e-20 for i in range(256)}
    for orth, addr in vocab._by_orth.items():
        lex = <LexemeC*>addr
        string = vocab.strings[lex.orth]
        utf8_str = string.encode('utf8')
        # The weight depends only on the lexeme, so compute it once rather
        # than once per byte.
        weight = c_exp(lex.prob)
        for char in utf8_str:
            # All 256 byte values are pre-seeded above, so the key is
            # always present and no default needs to be inserted.
            char_weights[ord(char)] += weight
        char_weights[ord(' ')] += weight
    return char_weights.items()


cdef class Packer:
    """Serialize Doc objects to byte strings and restore them again.

    The token text is written either as Huffman-coded orth IDs or, when that
    is not possible, as Huffman-coded UTF-8 bytes.  The sign of the leading
    32-bit length field tells the decoder which of the two was used.  Every
    remaining attribute column is then written with its own Huffman codec.
    """
    def __init__(self, Vocab vocab, attr_freqs, char_freqs=None):
        """Build the orth, character and per-attribute codecs.

        vocab: Vocab the orth codec is built from; also used when decoding.
        attr_freqs: Iterable of (attr, freqs) pairs.  Pairs are processed in
            sorted order and ORTH, ID and SPACY entries are skipped.
        char_freqs: Frequencies for the character codec.  When None they are
            generated from `vocab`.
        """
        if char_freqs is None:
            char_freqs = _gen_chars(vocab)
        self.vocab = vocab
        self.orth_codec = HuffmanCodec(_gen_orths(vocab))
        self.char_codec = HuffmanCodec(char_freqs)

        attr_codecs = []
        attr_ids = []
        for attr_id, freqs in sorted(attr_freqs):
            # These three are covered by the orth/char encoding itself.
            if attr_id not in (ORTH, ID, SPACY):
                attr_codecs.append(HuffmanCodec(freqs))
                attr_ids.append(attr_id)
        self._codecs = tuple(attr_codecs)
        self.attrs = tuple(attr_ids)

    def pack(self, Doc doc):
        """Encode `doc` and return the result of the bit array's as_bytes()."""
        cdef int col
        # Try the orth encoding first; it returns None when it cannot be used.
        encoded = self._orth_encode(doc)
        if encoded is None:
            encoded = self._char_encode(doc)
        if not self.attrs:
            return encoded.as_bytes()
        values = doc.to_array(self.attrs)
        for col, codec in enumerate(self._codecs):
            codec.encode(values[:, col], encoded)
        return encoded.as_bytes()

    def unpack(self, data):
        """Decode `data` into a new Doc built on this packer's vocab."""
        restored = Doc(self.vocab)
        self.unpack_into(data, restored)
        return restored

    def unpack_into(self, byte_string, Doc doc):
        """Decode `byte_string`, adding its tokens and attributes to `doc`.

        Returns `doc`.
        """
        cdef int32_t header
        stream = BitArray(byte_string)
        stream.seek(0)
        header = stream.read32()
        # A negative header marks the character encoding (see _char_encode).
        if header < 0:
            self._char_decode(stream, -header, doc)
        else:
            self._orth_decode(stream, header, doc)

        values = numpy.zeros(shape=(len(doc), len(self._codecs)),
                             dtype=numpy.int32)
        for col, codec in enumerate(self._codecs):
            codec.decode(stream, values[:, col])

        doc.from_array(self.attrs, values)
        return doc

    def _orth_encode(self, Doc doc):
        """Encode the tokens of `doc` by orth ID.

        Returns a BitArray holding the token count (32 bits), the coded orth
        IDs and one whitespace flag per token.  Returns None if any token is
        out-of-vocabulary or if the orth codec reports that 0 bits were
        written.
        """
        cdef BitArray out
        cdef int32_t n_tokens
        for token in doc:
            if token.is_oov:
                return None
        out = BitArray()
        n_tokens = len(doc)
        out.extend(n_tokens, 32)
        orth_table = doc.to_array([ORTH])
        if self.orth_codec.encode_int32(orth_table[:, 0], out) == 0:
            return None
        for token in doc:
            out.append(bool(token.whitespace_))
        return out

    def _char_encode(self, Doc doc):
        """Encode the text of `doc` byte-by-byte.

        Returns a BitArray holding the negated UTF-8 byte length (32 bits),
        the coded bytes, and then a run of flags for each token: one False
        per character except the last, a True for the last character, and an
        extra False when the token's `spacy` field is set.
        """
        cdef bytes raw = doc.string.encode('utf8')
        cdef BitArray out = BitArray()
        cdef int32_t n_bytes = len(raw)
        cdef int tok_i, char_j
        # Signal chars with negative length
        out.extend(-n_bytes, 32)
        self.char_codec.encode(bytearray(raw), out)
        for tok_i in range(doc.length):
            for char_j in range(doc.c[tok_i].lex.length-1):
                out.append(False)
            out.append(True)
            if doc.c[tok_i].spacy:
                out.append(False)
        return out

    def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
        """Read `n` orth IDs and `n` whitespace flags from `bits` into `doc`.

        Each ID is looked up with `vocab.get_by_orth` and pushed onto `doc`
        together with its flag.  Returns `doc`.
        """
        cdef attr_t[:] word_ids = numpy.ndarray(shape=(n,), dtype=numpy.int32)
        cdef int idx
        cdef bint has_space
        self.orth_codec.decode_int32(bits, word_ids)
        space_flags = iter(bits)
        for idx in range(n):
            word_id = word_ids[idx]
            has_space = next(space_flags)
            lexeme = self.vocab.get_by_orth(doc.mem, word_id)
            doc.push_back(lexeme, has_space)
        return doc

    def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
        """Read `n_bytes` coded UTF-8 bytes plus boundary flags into `doc`.

        One flag is read per unicode character of the decoded text; a set
        flag means the character ends a token.  A space immediately after a
        token end is recorded as that token's trailing whitespace and is left
        out of the next token's text.  Returns `doc`.
        """
        cdef bytearray buf = bytearray(n_bytes)
        cdef unicode text
        cdef int tok_start = 0
        cdef int pos = 0
        cdef int n_chars
        cdef bint has_space
        cdef bint ends_token
        self.char_codec.decode(bits, buf)

        text = buf.decode('utf8')
        n_chars = len(text)
        for ends_token in bits:
            # `pos` is the index just past the character this flag refers to.
            pos += 1
            if ends_token:
                piece = text[tok_start:pos]
                lexeme = self.vocab.get(doc.mem, piece)
                has_space = pos < n_chars and text[pos] == u' '
                doc.push_back(lexeme, has_space)
                tok_start = pos + has_space
            if pos >= n_chars:
                break
        return doc
* Fix hard-coded length 2015-07-18 05:09:56 +03:00			`# cython: profile=True`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`from __future__ import unicode_literals`

* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words. 2015-07-20 02:36:22 +03:00			`from libc.stdint cimport uint32_t, int32_t`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00			`from libc.stdint cimport uint64_t`
			`from libc.math cimport exp as c_exp`
			`from libcpp.queue cimport priority_queue`
			`from libcpp.pair cimport pair`

			`from cymem.cymem cimport Address, Pool`
			`from preshed.maps cimport PreshMap`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`from preshed.counter cimport PreshCounter`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`import json`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 16:18:17 +03:00			`from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00			`from ..tokens.doc cimport Doc`
			`from ..vocab cimport Vocab`
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 16:18:17 +03:00			`from ..structs cimport LexemeC`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00			`from ..typedefs cimport attr_t`
			`from .bits cimport BitArray`
			`from .huffman cimport HuffmanCodec`

			`from os import path`
			`import numpy`
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 16:18:17 +03:00			`from .. import util`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00
			`cimport cython`


			`# Format`
			`# - Total number of bytes in message (32 bit int) --- handled outside this`
			`# - Number of words (32 bit int)`
			`# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word`
			`# - Spaces 1 bit per word`
			`# - Attributes:`
			`# POS tag`
			`# Head offset`
			`# Dep label`
			`# Entity IOB`
			`# Entity tag`


			`cdef class _BinaryCodec:`
* Move to ORTH instead of ID for encoding lexemes. Basic tests of the codec wrappers now passing 2015-07-17 17:38:29 +03:00			`def encode(self, attr_t[:] msg, BitArray bits):`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00			`cdef int i`
* Move to ORTH instead of ID for encoding lexemes. Basic tests of the codec wrappers now passing 2015-07-17 17:38:29 +03:00			`for i in range(len(msg)):`
			`bits.append(msg[i])`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00
* Tests passing on round-trip pack/unpack on basic example 2015-07-17 22:20:48 +03:00			`def decode(self, BitArray bits, attr_t[:] msg):`
			`cdef int i = 0`
			`for bit in bits:`
			`msg[i] = bit`
			`i += 1`
			`if i == len(msg):`
			`break`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00

* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 16:18:17 +03:00			`def _gen_orths(Vocab vocab):`
			`cdef attr_t orth`
			`cdef size_t addr`
			`for orth, addr in vocab._by_orth.items():`
			`lex = <LexemeC*>addr`
			`yield orth, c_exp(lex.prob)`


* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`def _gen_chars(Vocab vocab):`
			`cdef attr_t orth`
			`cdef size_t addr`
* Fix bytes problems for Python3 2015-07-24 04:48:23 +03:00			`char_weights = {i: 1e-20 for i in range(256)}`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`cdef unicode string`
* Implement both character and orth encoding in Packer, so that we can decide which to use per-text 2015-07-19 23:39:45 +03:00			`cdef bytes char`
			`cdef bytes utf8_str`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`for orth, addr in vocab._by_orth.items():`
			`lex = <LexemeC*>addr`
			`string = vocab.strings[lex.orth]`
* Implement both character and orth encoding in Packer, so that we can decide which to use per-text 2015-07-19 23:39:45 +03:00			`utf8_str = string.encode('utf8')`
			`for char in utf8_str:`
* Fix bytes problems for Python3 2015-07-24 04:48:23 +03:00			`char_weights.setdefault(ord(char), 0.0)`
			`char_weights[ord(char)] += c_exp(lex.prob)`
			`char_weights[ord(' ')] += c_exp(lex.prob)`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`return char_weights.items()`


* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00			`cdef class Packer:`
* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words. 2015-07-20 02:36:22 +03:00			`def __init__(self, Vocab vocab, attr_freqs, char_freqs=None):`
			`if char_freqs is None:`
			`char_freqs = _gen_chars(vocab)`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00			`self.vocab = vocab`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`self.orth_codec = HuffmanCodec(_gen_orths(vocab))`
* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words. 2015-07-20 02:36:22 +03:00			`self.char_codec = HuffmanCodec(char_freqs)`
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 16:18:17 +03:00
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`codecs = []`
			`attrs = []`
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. 2015-07-19 16:18:17 +03:00			`for attr, freqs in sorted(attr_freqs):`
			`if attr in (ORTH, ID, SPACY):`
			`continue`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`codecs.append(HuffmanCodec(freqs))`
* Tests passing on round-trip pack/unpack on basic example 2015-07-17 22:20:48 +03:00			`attrs.append(attr)`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00			`self._codecs = tuple(codecs)`
* Tests passing on round-trip pack/unpack on basic example 2015-07-17 22:20:48 +03:00			`self.attrs = tuple(attrs)`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00
* Move to ORTH instead of ID for encoding lexemes. Basic tests of the codec wrappers now passing 2015-07-17 17:38:29 +03:00			`def pack(self, Doc doc):`
* Optimistically try orth encoding, with char as a back-off 2015-07-21 21:22:45 +03:00			`bits = self._orth_encode(doc)`
			`if bits is None:`
			`bits = self._char_encode(doc)`
* Improve serialization speed 2015-07-20 04:27:59 +03:00			`cdef int i`
			`if self.attrs:`
			`array = doc.to_array(self.attrs)`
			`for i, codec in enumerate(self._codecs):`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`codec.encode(array[:, i], bits)`
			`return bits.as_bytes()`

* Fix bytes problems for Python3 2015-07-24 04:48:23 +03:00			`def unpack(self, data):`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`doc = Doc(self.vocab)`
			`self.unpack_into(data, doc)`
			`return doc`
* Major refactor of serialization. Nearly complete now. 2015-07-17 02:19:29 +03:00
* Fix bytes problems for Python3 2015-07-24 04:48:23 +03:00			`def unpack_into(self, byte_string, Doc doc):`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`bits = BitArray(byte_string)`
* Tests passing on round-trip pack/unpack on basic example 2015-07-17 22:20:48 +03:00			`bits.seek(0)`
* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words. 2015-07-20 02:36:22 +03:00			`cdef int32_t length = bits.read32()`
			`if length >= 0:`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`self._orth_decode(bits, length, doc)`
* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words. 2015-07-20 02:36:22 +03:00			`else:`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`self._char_decode(bits, -length, doc)`

* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)`
* Tests passing on round-trip pack/unpack on basic example 2015-07-17 22:20:48 +03:00			`for i, codec in enumerate(self._codecs):`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`codec.decode(bits, array[:, i])`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00
			`doc.from_array(self.attrs, array)`
			`return doc`

* Optimistically try orth encoding, with char as a back-off 2015-07-21 21:22:45 +03:00			`def _orth_encode(self, Doc doc):`
* Ensure we don't use orth_encode on OOV words. 2015-07-27 03:12:01 +03:00			`for t in doc:`
			`if t.is_oov:`
			`return None`
* Implement both character and orth encoding in Packer, so that we can decide which to use per-text 2015-07-19 23:39:45 +03:00			`cdef BitArray bits = BitArray()`
* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words. 2015-07-20 02:36:22 +03:00			`cdef int32_t length = len(doc)`
			`bits.extend(length, 32)`
* Optimistically try orth encoding, with char as a back-off 2015-07-21 21:22:45 +03:00			`orths = doc.to_array([ORTH])`
			`n_bits = self.orth_codec.encode_int32(orths[:, 0], bits)`
			`if n_bits == 0:`
			`return None`
* Implement both character and orth encoding in Packer, so that we can decide which to use per-text 2015-07-19 23:39:45 +03:00			`for token in doc:`
			`bits.append(bool(token.whitespace_))`
			`return bits`

* Optimistically try orth encoding, with char as a back-off 2015-07-21 21:22:45 +03:00			`def _char_encode(self, Doc doc):`
			`cdef bytes utf8_str = doc.string.encode('utf8')`
* Implement both character and orth encoding in Packer, so that we can decide which to use per-text 2015-07-19 23:39:45 +03:00			`cdef BitArray bits = BitArray()`
* Switch between the orth and char codecs depending on which is shorter for that message. Mostly orth is shorter, except if there are OOV words. 2015-07-20 02:36:22 +03:00			`cdef int32_t length = len(utf8_str)`
			`# Signal chars with negative length`
			`bits.extend(-length, 32)`
* Fix regression in packer 2015-07-27 22:53:38 +03:00			`self.char_codec.encode(bytearray(utf8_str), bits)`
* Improve serialization speed 2015-07-20 04:27:59 +03:00			`cdef int i, j`
			`for i in range(doc.length):`
* Rename Doc.data to Doc.c 2015-11-03 16:17:13 +03:00			`for j in range(doc.c[i].lex.length-1):`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`bits.append(False)`
			`bits.append(True)`
* Rename Doc.data to Doc.c 2015-11-03 16:17:13 +03:00			`if doc.c[i].spacy:`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`bits.append(False)`
* Implement both character and orth encoding in Packer, so that we can decide which to use per-text 2015-07-19 23:39:45 +03:00			`return bits`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`def _orth_decode(self, BitArray bits, int32_t n, Doc doc):`
			`cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)`
			`self.orth_codec.decode_int32(bits, orths)`
			`cdef int i`
			`cdef bint space`
			`spaces = iter(bits)`
			`for i in range(n):`
			`orth = orths[i]`
* Fix bytes problems for Python3 2015-07-24 04:48:23 +03:00			`space = next(spaces)`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`lex = self.vocab.get_by_orth(doc.mem, orth)`
			`doc.push_back(lex, space)`
			`return doc`

* Fix bug in decoding non-ascii characters 2015-07-27 22:43:58 +03:00			`def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):`
			`cdef bytearray utf8_str = bytearray(n_bytes)`
* Improve serialization speed 2015-07-20 04:27:59 +03:00			`self.char_codec.decode(bits, utf8_str)`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00
* Implement both character and orth encoding in Packer, so that we can decide which to use per-text 2015-07-19 23:39:45 +03:00			`cdef unicode string = utf8_str.decode('utf8')`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`cdef int start = 0`
			`cdef bint is_spacy`
* Fix bug in decoding non-ascii characters 2015-07-27 22:43:58 +03:00			`cdef int n_unicode_chars = len(string)`
* Improve serialization speed 2015-07-20 04:27:59 +03:00			`cdef int i = 0`
			`cdef bint is_end_token`
			`for is_end_token in bits:`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`if is_end_token:`
* Remove UniStr struct 2015-07-22 14:39:17 +03:00			`span = string[start:i+1]`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`lex = self.vocab.get(doc.mem, span)`
* Fix bug in decoding non-ascii characters 2015-07-27 22:43:58 +03:00			`is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`doc.push_back(lex, is_spacy)`
* Implement character-based codec, so that we can do word/char backoff 2015-07-19 23:03:39 +03:00			`start = i + 1 + is_spacy`
* Improve serialization speed 2015-07-20 04:27:59 +03:00			`i += 1`
* Fix bug in decoding non-ascii characters 2015-07-27 22:43:58 +03:00			`if i >= n_unicode_chars:`
* Improve serialization speed 2015-07-20 04:27:59 +03:00			`break`
* Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. 2015-07-23 02:12:00 +03:00			`return doc`