mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Major refactor of serialization. Nearly complete now.
This commit is contained in:
		
							parent
							
								
									c8282f9934
								
							
						
					
					
						commit
						db9dfd2e23
					
				
							
								
								
									
										4
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
					@ -94,6 +94,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
 | 
				
			||||||
                                   "data/vocab/lexemes.bin",
 | 
					                                   "data/vocab/lexemes.bin",
 | 
				
			||||||
                                   "data/vocab/strings.txt"],
 | 
					                                   "data/vocab/strings.txt"],
 | 
				
			||||||
                      "spacy.tokens": ["*.pxd"],
 | 
					                      "spacy.tokens": ["*.pxd"],
 | 
				
			||||||
 | 
					                      "spacy.serialize": ["*.pxd"],
 | 
				
			||||||
                      "spacy.syntax": ["*.pxd"]},
 | 
					                      "spacy.syntax": ["*.pxd"]},
 | 
				
			||||||
        ext_modules=exts,
 | 
					        ext_modules=exts,
 | 
				
			||||||
        cmdclass={'build_ext': Cython.Distutils.build_ext},
 | 
					        cmdclass={'build_ext': Cython.Distutils.build_ext},
 | 
				
			||||||
| 
						 | 
					@ -158,8 +159,9 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
 | 
				
			||||||
             'spacy.syntax.transition_system',
 | 
					             'spacy.syntax.transition_system',
 | 
				
			||||||
             'spacy.syntax.arc_eager',
 | 
					             'spacy.syntax.arc_eager',
 | 
				
			||||||
             'spacy.syntax._parse_features',
 | 
					             'spacy.syntax._parse_features',
 | 
				
			||||||
             'spacy.gold', 'spacy.orth', 'spacy.serialize',
 | 
					             'spacy.gold', 'spacy.orth',
 | 
				
			||||||
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
 | 
					             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
 | 
				
			||||||
 | 
					             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
 | 
				
			||||||
             'spacy.syntax.ner']
 | 
					             'spacy.syntax.ner']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -70,6 +70,7 @@ class English(object):
 | 
				
			||||||
      Tagger=EnPosTagger,
 | 
					      Tagger=EnPosTagger,
 | 
				
			||||||
      Parser=ParserFactory(ParserTransitionSystem),
 | 
					      Parser=ParserFactory(ParserTransitionSystem),
 | 
				
			||||||
      Entity=ParserFactory(EntityTransitionSystem),
 | 
					      Entity=ParserFactory(EntityTransitionSystem),
 | 
				
			||||||
 | 
					      Packer=None,
 | 
				
			||||||
      load_vectors=True
 | 
					      load_vectors=True
 | 
				
			||||||
    ):
 | 
					    ):
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
| 
						 | 
					@ -101,10 +102,10 @@ class English(object):
 | 
				
			||||||
            self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
 | 
					            self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            self.entity = None
 | 
					            self.entity = None
 | 
				
			||||||
        if Serializer:
 | 
					        if Packer:
 | 
				
			||||||
            self.bitter = Serializer(self.vocab, data_dir)
 | 
					            self.packer = Packer(self.vocab, data_dir)
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            self.bitter = None
 | 
					            self.packer = None
 | 
				
			||||||
        self.mwe_merger = RegexMerger([
 | 
					        self.mwe_merger = RegexMerger([
 | 
				
			||||||
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
 | 
					            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
 | 
				
			||||||
            ('CD', 'TIME', regexes.TIME_RE),
 | 
					            ('CD', 'TIME', regexes.TIME_RE),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,334 +0,0 @@
 | 
				
			||||||
from libcpp.vector cimport vector
 | 
					 | 
				
			||||||
from libc.stdint cimport uint32_t
 | 
					 | 
				
			||||||
from libc.stdint cimport int64_t
 | 
					 | 
				
			||||||
from libc.stdint cimport int32_t
 | 
					 | 
				
			||||||
from libc.stdint cimport uint64_t
 | 
					 | 
				
			||||||
from libcpp.queue cimport priority_queue
 | 
					 | 
				
			||||||
from libcpp.pair cimport pair
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from preshed.maps cimport PreshMap
 | 
					 | 
				
			||||||
from murmurhash.mrmr cimport hash64
 | 
					 | 
				
			||||||
from .tokens.doc cimport Doc
 | 
					 | 
				
			||||||
from .vocab cimport Vocab
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from os import path
 | 
					 | 
				
			||||||
import numpy
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cimport cython
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
ctypedef unsigned char uchar
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# Format
 | 
					 | 
				
			||||||
# - Total number of bytes in message (32 bit int)
 | 
					 | 
				
			||||||
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
 | 
					 | 
				
			||||||
# - Spaces ~1 bit per word
 | 
					 | 
				
			||||||
# - Parse: Huffman coded head offset / dep label / POS tag / entity IOB tag
 | 
					 | 
				
			||||||
#          combo. ? bits per word. 40 * 80 * 40 * 12 = 1.5m symbol vocab
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# Note that we're setting the most significant bits here first, when in practice
 | 
					 | 
				
			||||||
# we're actually wanting the last bit to be most significant (for Huffman coding,
 | 
					 | 
				
			||||||
# anyway).
 | 
					 | 
				
			||||||
cdef Code bit_append(Code code, bint bit) nogil:
 | 
					 | 
				
			||||||
    cdef uint64_t one = 1
 | 
					 | 
				
			||||||
    if bit:
 | 
					 | 
				
			||||||
        code.bits |= one << code.length
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        code.bits &= ~(one << code.length)
 | 
					 | 
				
			||||||
    code.length += 1
 | 
					 | 
				
			||||||
    return code
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class BitArray:
 | 
					 | 
				
			||||||
    cdef bytes data
 | 
					 | 
				
			||||||
    cdef unsigned char byte
 | 
					 | 
				
			||||||
    cdef unsigned char bit_of_byte
 | 
					 | 
				
			||||||
    cdef uint32_t i
 | 
					 | 
				
			||||||
    def __init__(self):
 | 
					 | 
				
			||||||
        self.data = b''
 | 
					 | 
				
			||||||
        self.byte = 0
 | 
					 | 
				
			||||||
        self.bit_of_byte = 0
 | 
					 | 
				
			||||||
        self.i = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __iter__(self):
 | 
					 | 
				
			||||||
        cdef uchar byte, i
 | 
					 | 
				
			||||||
        cdef uchar one = 1
 | 
					 | 
				
			||||||
        start_byte = self.i // 8
 | 
					 | 
				
			||||||
        if (self.i % 8) != 0:
 | 
					 | 
				
			||||||
            for i in range(self.i % 8):
 | 
					 | 
				
			||||||
                yield 1 if (self.data[start_byte] & (one << i)) else 0
 | 
					 | 
				
			||||||
            start_byte += 1
 | 
					 | 
				
			||||||
        for byte in self.data[start_byte:]:
 | 
					 | 
				
			||||||
            for i in range(8):
 | 
					 | 
				
			||||||
                yield 1 if byte & (one << i) else 0
 | 
					 | 
				
			||||||
        for i in range(self.bit_of_byte):
 | 
					 | 
				
			||||||
            yield 1 if self.byte & (one << i) else 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def as_bytes(self):
 | 
					 | 
				
			||||||
        if self.bit_of_byte != 0:
 | 
					 | 
				
			||||||
            return self.data + chr(self.byte)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            return self.data
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def append(self, bint bit):
 | 
					 | 
				
			||||||
        cdef uint64_t one = 1
 | 
					 | 
				
			||||||
        if bit:
 | 
					 | 
				
			||||||
            self.byte |= one << self.bit_of_byte
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            self.byte &= ~(one << self.bit_of_byte)
 | 
					 | 
				
			||||||
        self.bit_of_byte += 1
 | 
					 | 
				
			||||||
        if self.bit_of_byte == 8:
 | 
					 | 
				
			||||||
            self.data += chr(self.byte)
 | 
					 | 
				
			||||||
            self.byte = 0
 | 
					 | 
				
			||||||
            self.bit_of_byte = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef int extend(self, uint64_t code, char n_bits) except -1:
 | 
					 | 
				
			||||||
        cdef uint64_t one = 1
 | 
					 | 
				
			||||||
        cdef unsigned char bit_of_code
 | 
					 | 
				
			||||||
        for bit_of_code in range(n_bits):
 | 
					 | 
				
			||||||
            if code & (one << bit_of_code):
 | 
					 | 
				
			||||||
                self.byte |= one << self.bit_of_byte
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                self.byte &= ~(one << self.bit_of_byte)
 | 
					 | 
				
			||||||
            self.bit_of_byte += 1
 | 
					 | 
				
			||||||
            if self.bit_of_byte == 8:
 | 
					 | 
				
			||||||
                self.data += chr(self.byte)
 | 
					 | 
				
			||||||
                self.byte = 0
 | 
					 | 
				
			||||||
                self.bit_of_byte = 0
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class Serializer:
 | 
					 | 
				
			||||||
    # Manage codecs, maintain consistent format for io
 | 
					 | 
				
			||||||
    def __init__(self, Vocab vocab, data_dir):
 | 
					 | 
				
			||||||
        model_dir = path.join(data_dir, 'bitter')
 | 
					 | 
				
			||||||
        self.vocab = vocab # Vocab owns the word codec, the big one
 | 
					 | 
				
			||||||
        #self.cfg = Config.read(model_dir, 'config')
 | 
					 | 
				
			||||||
        self.codecs = tuple([CodecWrapper(attr) for attr in self.cfg.attrs])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __call__(self, doc_or_bits):
 | 
					 | 
				
			||||||
        if isinstance(doc_or_bits, Doc):
 | 
					 | 
				
			||||||
            return self.serialize(doc_or_bits)
 | 
					 | 
				
			||||||
        elif isinstance(doc_or_bits, BitArray):
 | 
					 | 
				
			||||||
            return self.deserialize(doc_or_bits)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            raise ValueError(doc_or_bits)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def train(self, doc):
 | 
					 | 
				
			||||||
        array = doc.to_array([codec.id for codec in self.codecs])
 | 
					 | 
				
			||||||
        for i, codec in enumerate(self.codecs):
 | 
					 | 
				
			||||||
            codec.count(array[i]) 
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def serialize(self, doc):
 | 
					 | 
				
			||||||
        bits = BitArray()
 | 
					 | 
				
			||||||
        array = doc.to_array(self.attrs)
 | 
					 | 
				
			||||||
        for i, codec in enumerate(self.codecs):
 | 
					 | 
				
			||||||
            codec.encode(array[i,], bits)
 | 
					 | 
				
			||||||
        return bits
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    @cython.boundscheck(False)
 | 
					 | 
				
			||||||
    def deserialize(self, bits):
 | 
					 | 
				
			||||||
        biterator = iter(bits)
 | 
					 | 
				
			||||||
        cdef Doc doc = Doc(self.vocab)
 | 
					 | 
				
			||||||
        ids = self.vocab.codec.decode(biterator)
 | 
					 | 
				
			||||||
        cdef int id_
 | 
					 | 
				
			||||||
        cdef bint is_spacy
 | 
					 | 
				
			||||||
        for id_ in ids:
 | 
					 | 
				
			||||||
            is_spacy = biterator.next()
 | 
					 | 
				
			||||||
            doc.push_back(self.vocab.lexemes.at(id_), is_spacy)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        cdef int length = doc.length
 | 
					 | 
				
			||||||
        array = numpy.zeros(shape=(length, len(self.codecs)), dtype=numpy.int)
 | 
					 | 
				
			||||||
        for i, codec in enumerate(self.codecs):
 | 
					 | 
				
			||||||
            array[i] = codec.decode(biterator)
 | 
					 | 
				
			||||||
        doc.from_array([c.id for c in self.codecs], array)
 | 
					 | 
				
			||||||
        return doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class CodecWrapper:
 | 
					 | 
				
			||||||
    """Wrapper around HuffmanCodec"""
 | 
					 | 
				
			||||||
    def __init__(self, freqs, id=0):
 | 
					 | 
				
			||||||
        cdef uint64_t key
 | 
					 | 
				
			||||||
        cdef uint64_t count
 | 
					 | 
				
			||||||
        cdef pair[uint64_t, uint64_t] item
 | 
					 | 
				
			||||||
        cdef priority_queue[pair[uint64_t, uint64_t]] items
 | 
					 | 
				
			||||||
        for key, count in freqs:
 | 
					 | 
				
			||||||
            item.first = count
 | 
					 | 
				
			||||||
            item.second = key
 | 
					 | 
				
			||||||
            items.push(item)
 | 
					 | 
				
			||||||
        
 | 
					 | 
				
			||||||
        weights = [] #array('f')
 | 
					 | 
				
			||||||
        keys = [] #array('i')
 | 
					 | 
				
			||||||
        key_to_i = PreshMap()
 | 
					 | 
				
			||||||
        i = 0
 | 
					 | 
				
			||||||
        while not items.empty():
 | 
					 | 
				
			||||||
            item = items.top()
 | 
					 | 
				
			||||||
            weights.append(item.first)
 | 
					 | 
				
			||||||
            keys.append(item.second)
 | 
					 | 
				
			||||||
            key_to_i[item.second] = i
 | 
					 | 
				
			||||||
            i += 1
 | 
					 | 
				
			||||||
            items.pop()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def encode(self, symbols):
 | 
					 | 
				
			||||||
        indices = [self.table[symbol] for symbol in symbols]
 | 
					 | 
				
			||||||
        return self._codec.encode(indices)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def decode(self, bits):
 | 
					 | 
				
			||||||
        indices = self._codec.decode(bits)
 | 
					 | 
				
			||||||
        return [self.symbols[i] for i in indices]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class HuffmanCodec:
 | 
					 | 
				
			||||||
    """Create a Huffman code table, and use it to pack and unpack sequences into
 | 
					 | 
				
			||||||
    byte strings. Emphasis is on efficiency, so API is quite strict:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Messages will be encoded/decoded as indices that refer to the probability sequence.
 | 
					 | 
				
			||||||
    For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
 | 
					 | 
				
			||||||
    the 10th most frequent item, the 8th most frequent item.  The codec will add
 | 
					 | 
				
			||||||
    the EOL symbol to your message. An exception will be raised if you include
 | 
					 | 
				
			||||||
    the EOL symbol in your message.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Arguments:
 | 
					 | 
				
			||||||
        weights (float[:]): A descending-sorted sequence of probabilities/weights.
 | 
					 | 
				
			||||||
          Must include a weight for an EOL symbol.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        eol (uint32_t): The index of the weight of the EOL symbol.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    def __init__(self, float[:] weights, uint32_t eol):
 | 
					 | 
				
			||||||
        self.codes.resize(len(weights))
 | 
					 | 
				
			||||||
        for i in range(len(self.codes)):
 | 
					 | 
				
			||||||
            self.codes[i].bits = 0
 | 
					 | 
				
			||||||
            self.codes[i].length = 0
 | 
					 | 
				
			||||||
        populate_nodes(self.nodes, weights)
 | 
					 | 
				
			||||||
        cdef Code path
 | 
					 | 
				
			||||||
        path.bits = 0
 | 
					 | 
				
			||||||
        path.length = 0
 | 
					 | 
				
			||||||
        assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def encode(self, uint32_t[:] sequence, BitArray bits=None):
 | 
					 | 
				
			||||||
        if bits is None:
 | 
					 | 
				
			||||||
            bits = BitArray()
 | 
					 | 
				
			||||||
        for i in sequence:
 | 
					 | 
				
			||||||
            bits.extend(self.codes[i].bits, self.codes[i].length) 
 | 
					 | 
				
			||||||
        bits.extend(self.codes[self.eol].bits, self.codes[self.eol].length)
 | 
					 | 
				
			||||||
        return bits
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def decode(self, bits):
 | 
					 | 
				
			||||||
        node = self.nodes.back()
 | 
					 | 
				
			||||||
        symbols = []
 | 
					 | 
				
			||||||
        for bit in bits:
 | 
					 | 
				
			||||||
            branch = node.right if bit else node.left
 | 
					 | 
				
			||||||
            if branch >= 0:
 | 
					 | 
				
			||||||
                node = self.nodes.at(branch)
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                symbol = -(branch + 1)
 | 
					 | 
				
			||||||
                if symbol == self.eol:
 | 
					 | 
				
			||||||
                    return symbols
 | 
					 | 
				
			||||||
                else:
 | 
					 | 
				
			||||||
                    symbols.append(symbol)
 | 
					 | 
				
			||||||
                node = self.nodes.back()
 | 
					 | 
				
			||||||
        return symbols
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    property strings:
 | 
					 | 
				
			||||||
        @cython.boundscheck(False)
 | 
					 | 
				
			||||||
        @cython.wraparound(False)
 | 
					 | 
				
			||||||
        @cython.nonecheck(False)
 | 
					 | 
				
			||||||
        def __get__(self):
 | 
					 | 
				
			||||||
            output = []
 | 
					 | 
				
			||||||
            cdef int i, j
 | 
					 | 
				
			||||||
            cdef bytes string
 | 
					 | 
				
			||||||
            cdef Code code
 | 
					 | 
				
			||||||
            for i in range(self.codes.size()):
 | 
					 | 
				
			||||||
                code = self.codes[i]
 | 
					 | 
				
			||||||
                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
 | 
					 | 
				
			||||||
                string = string[::-1]
 | 
					 | 
				
			||||||
                output.append(string)
 | 
					 | 
				
			||||||
            return output
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@cython.boundscheck(False)
 | 
					 | 
				
			||||||
@cython.wraparound(False)
 | 
					 | 
				
			||||||
@cython.nonecheck(False)
 | 
					 | 
				
			||||||
cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
 | 
					 | 
				
			||||||
    assert len(probs) >= 3
 | 
					 | 
				
			||||||
    cdef int size = len(probs)
 | 
					 | 
				
			||||||
    cdef int i = size - 1
 | 
					 | 
				
			||||||
    cdef int j = 0
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    while i >= 0 or (j+1) < nodes.size():
 | 
					 | 
				
			||||||
        if i < 0:
 | 
					 | 
				
			||||||
            _cover_two_nodes(nodes, j)
 | 
					 | 
				
			||||||
            j += 2
 | 
					 | 
				
			||||||
        elif j >= nodes.size():
 | 
					 | 
				
			||||||
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
 | 
					 | 
				
			||||||
            i -= 2
 | 
					 | 
				
			||||||
        elif i >= 1 and (j == nodes.size() or probs[i-1] < nodes[j].prob):
 | 
					 | 
				
			||||||
            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
 | 
					 | 
				
			||||||
            i -= 2
 | 
					 | 
				
			||||||
        elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
 | 
					 | 
				
			||||||
            _cover_two_nodes(nodes, j)
 | 
					 | 
				
			||||||
            j += 2
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            _cover_one_word_one_node(nodes, j, i, probs[i])
 | 
					 | 
				
			||||||
            i -= 1
 | 
					 | 
				
			||||||
            j += 1
 | 
					 | 
				
			||||||
    return 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
 | 
					 | 
				
			||||||
    """Introduce a new non-terminal, over two non-terminals)"""
 | 
					 | 
				
			||||||
    cdef Node node
 | 
					 | 
				
			||||||
    node.left = j
 | 
					 | 
				
			||||||
    node.right = j+1
 | 
					 | 
				
			||||||
    node.prob = nodes[j].prob + nodes[j+1].prob
 | 
					 | 
				
			||||||
    nodes.push_back(node)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
 | 
					 | 
				
			||||||
    """Introduce a new non-terminal, over one terminal and one non-terminal."""
 | 
					 | 
				
			||||||
    cdef Node node
 | 
					 | 
				
			||||||
    # Encode leaves as negative integers, where the integer is the index of the
 | 
					 | 
				
			||||||
    # word in the vocabulary.
 | 
					 | 
				
			||||||
    cdef int64_t leaf_id = - <int64_t>(id_ + 1)
 | 
					 | 
				
			||||||
    cdef float new_prob = prob + nodes[j].prob
 | 
					 | 
				
			||||||
    if prob < nodes[j].prob:
 | 
					 | 
				
			||||||
        node.left = leaf_id
 | 
					 | 
				
			||||||
        node.right = j
 | 
					 | 
				
			||||||
        node.prob = new_prob
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        node.left = j
 | 
					 | 
				
			||||||
        node.right = leaf_id
 | 
					 | 
				
			||||||
        node.prob = new_prob
 | 
					 | 
				
			||||||
    nodes.push_back(node)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
 | 
					 | 
				
			||||||
    """Introduce a new node, over two non-terminals."""
 | 
					 | 
				
			||||||
    cdef Node node
 | 
					 | 
				
			||||||
    node.left = -(id1+1)
 | 
					 | 
				
			||||||
    node.right = -(id2+1)
 | 
					 | 
				
			||||||
    node.prob = prob
 | 
					 | 
				
			||||||
    nodes.push_back(node)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
 | 
					 | 
				
			||||||
    """Recursively assign paths, from the top down. At the end, the entry codes[i]
 | 
					 | 
				
			||||||
    knows the bit-address of the node[j] that points to entry i in the vocabulary.
 | 
					 | 
				
			||||||
    So, to encode i, we go to codes[i] and read its bit-string. To decode, we
 | 
					 | 
				
			||||||
    navigate nodes recursively.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    cdef Code left_path = bit_append(path, 0)
 | 
					 | 
				
			||||||
    cdef Code right_path = bit_append(path, 1)
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    # Assign down left branch
 | 
					 | 
				
			||||||
    if nodes[i].left >= 0:
 | 
					 | 
				
			||||||
        assign_codes(nodes, codes, nodes[i].left, left_path)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        # Leaf on left
 | 
					 | 
				
			||||||
        id_ = -(nodes[i].left + 1)
 | 
					 | 
				
			||||||
        codes[id_] = left_path
 | 
					 | 
				
			||||||
    # Assign down right branch
 | 
					 | 
				
			||||||
    if nodes[i].right >= 0:
 | 
					 | 
				
			||||||
        assign_codes(nodes, codes, nodes[i].right, right_path)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        # Leaf on right
 | 
					 | 
				
			||||||
        id_ = -(nodes[i].right + 1)
 | 
					 | 
				
			||||||
        codes[id_] = right_path
 | 
					 | 
				
			||||||
							
								
								
									
										21
									
								
								spacy/serialize/bits.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								spacy/serialize/bits.pxd
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,21 @@
 | 
				
			||||||
 | 
					from libc.stdint cimport uint64_t
 | 
				
			||||||
 | 
					from libc.stdint cimport uint32_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ctypedef unsigned char uchar
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef struct Code:
 | 
				
			||||||
 | 
					    uint64_t bits
 | 
				
			||||||
 | 
					    char length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef Code bit_append(Code code, bint bit) nogil
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class BitArray:
 | 
				
			||||||
 | 
					    cdef bytes data
 | 
				
			||||||
 | 
					    cdef uchar byte
 | 
				
			||||||
 | 
					    cdef uchar bit_of_byte
 | 
				
			||||||
 | 
					    cdef uint32_t i
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    cdef int extend(self, uint64_t code, char n_bits) except -1
 | 
				
			||||||
							
								
								
									
										69
									
								
								spacy/serialize/bits.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								spacy/serialize/bits.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,69 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Note that we're setting the most significant bits here first, when in practice
 | 
				
			||||||
 | 
					# we're actually wanting the last bit to be most significant (for Huffman coding,
 | 
				
			||||||
 | 
					# anyway).
 | 
				
			||||||
 | 
					cdef Code bit_append(Code code, bint bit) nogil:
 | 
				
			||||||
 | 
					    cdef uint64_t one = 1
 | 
				
			||||||
 | 
					    if bit:
 | 
				
			||||||
 | 
					        code.bits |= one << code.length
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        code.bits &= ~(one << code.length)
 | 
				
			||||||
 | 
					    code.length += 1
 | 
				
			||||||
 | 
					    return code
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class BitArray:
 | 
				
			||||||
 | 
					    def __init__(self):
 | 
				
			||||||
 | 
					        self.data = b''
 | 
				
			||||||
 | 
					        self.byte = 0
 | 
				
			||||||
 | 
					        self.bit_of_byte = 0
 | 
				
			||||||
 | 
					        self.i = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __iter__(self):
 | 
				
			||||||
 | 
					        cdef uchar byte, i
 | 
				
			||||||
 | 
					        cdef uchar one = 1
 | 
				
			||||||
 | 
					        start_byte = self.i // 8
 | 
				
			||||||
 | 
					        if (self.i % 8) != 0:
 | 
				
			||||||
 | 
					            for i in range(self.i % 8):
 | 
				
			||||||
 | 
					                yield 1 if (self.data[start_byte] & (one << i)) else 0
 | 
				
			||||||
 | 
					            start_byte += 1
 | 
				
			||||||
 | 
					        for byte in self.data[start_byte:]:
 | 
				
			||||||
 | 
					            for i in range(8):
 | 
				
			||||||
 | 
					                yield 1 if byte & (one << i) else 0
 | 
				
			||||||
 | 
					        for i in range(self.bit_of_byte):
 | 
				
			||||||
 | 
					            yield 1 if self.byte & (one << i) else 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def as_bytes(self):
 | 
				
			||||||
 | 
					        if self.bit_of_byte != 0:
 | 
				
			||||||
 | 
					            return self.data + chr(self.byte)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            return self.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def append(self, bint bit):
 | 
				
			||||||
 | 
					        cdef uint64_t one = 1
 | 
				
			||||||
 | 
					        if bit:
 | 
				
			||||||
 | 
					            self.byte |= one << self.bit_of_byte
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            self.byte &= ~(one << self.bit_of_byte)
 | 
				
			||||||
 | 
					        self.bit_of_byte += 1
 | 
				
			||||||
 | 
					        if self.bit_of_byte == 8:
 | 
				
			||||||
 | 
					            self.data += chr(self.byte)
 | 
				
			||||||
 | 
					            self.byte = 0
 | 
				
			||||||
 | 
					            self.bit_of_byte = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cdef int extend(self, uint64_t code, char n_bits) except -1:
 | 
				
			||||||
 | 
					        cdef uint64_t one = 1
 | 
				
			||||||
 | 
					        cdef unsigned char bit_of_code
 | 
				
			||||||
 | 
					        for bit_of_code in range(n_bits):
 | 
				
			||||||
 | 
					            if code & (one << bit_of_code):
 | 
				
			||||||
 | 
					                self.byte |= one << self.bit_of_byte
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                self.byte &= ~(one << self.bit_of_byte)
 | 
				
			||||||
 | 
					            self.bit_of_byte += 1
 | 
				
			||||||
 | 
					            if self.bit_of_byte == 8:
 | 
				
			||||||
 | 
					                self.data += chr(self.byte)
 | 
				
			||||||
 | 
					                self.byte = 0
 | 
				
			||||||
 | 
					                self.bit_of_byte = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,7 +4,7 @@ from libc.stdint cimport int64_t
 | 
				
			||||||
from libc.stdint cimport int32_t
 | 
					from libc.stdint cimport int32_t
 | 
				
			||||||
from libc.stdint cimport uint64_t
 | 
					from libc.stdint cimport uint64_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .vocab cimport Vocab
 | 
					from .bits cimport Code
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef struct Node:
 | 
					cdef struct Node:
 | 
				
			||||||
| 
						 | 
					@ -13,19 +13,6 @@ cdef struct Node:
 | 
				
			||||||
    int32_t right
 | 
					    int32_t right
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef struct Code:
 | 
					 | 
				
			||||||
    uint64_t bits
 | 
					 | 
				
			||||||
    char length
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class Serializer:
 | 
					 | 
				
			||||||
    cdef list codecs
 | 
					 | 
				
			||||||
    cdef Vocab vocab
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class HuffmanCodec:
 | 
					cdef class HuffmanCodec:
 | 
				
			||||||
    cdef vector[Node] nodes
 | 
					    cdef vector[Node] nodes
 | 
				
			||||||
    cdef vector[Code] codes
 | 
					    cdef vector[Code] codes
 | 
				
			||||||
    cdef uint32_t eol
 | 
					 | 
				
			||||||
    cdef int id
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
							
								
								
									
										157
									
								
								spacy/serialize/huffman.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										157
									
								
								spacy/serialize/huffman.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,157 @@
 | 
				
			||||||
 | 
					cimport cython
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .bits cimport bit_append
 | 
				
			||||||
 | 
					from .bits cimport BitArray
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class HuffmanCodec:
					    """Create a Huffman code table, and use it to pack and unpack sequences of
					    symbol indices. Emphasis is on efficiency, so the API is quite strict:

					    Messages are encoded/decoded as indices that refer to the weight sequence.
					    For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
					    the 10th most frequent item, the 8th most frequent item.

					    Arguments:
					        weights (float[:]): A descending-sorted sequence of probabilities/weights,
					          one per symbol. At least three weights are required (asserted when
					          the tree is built).
					    """
					    def __init__(self, float[:] weights):
					        cdef int i
					        cdef Code path
					        self.codes.resize(len(weights))
					        for i in range(self.codes.size()):
					            self.codes[i].bits = 0
					            self.codes[i].length = 0
					        populate_nodes(self.nodes, weights)
					        # The root is the last node created; codes are assigned from it
					        # downwards, starting with an empty path.
					        path.bits = 0
					        path.length = 0
					        assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)

					    def encode(self, uint32_t[:] msg, BitArray into_bits):
					        """Append the code of every symbol index in msg to into_bits.

					        Raises:
					            IndexError: if a symbol index has no entry in the code table.
					        """
					        cdef uint32_t i
					        cdef uint32_t symbol
					        for i in range(len(msg)):
					            symbol = msg[i]
					            # self.codes is a C++ vector, so indexing it is unchecked:
					            # reject unknown symbols explicitly instead of reading garbage.
					            if symbol >= self.codes.size():
					                raise IndexError(
					                    "Symbol index %d is outside the code table (size %d)"
					                    % (symbol, self.codes.size()))
					            into_bits.extend(self.codes[symbol].bits, self.codes[symbol].length)

					    def decode(self, bits, uint32_t[:] into_msg):
					        """Read bits until len(into_msg) symbols have been decoded into into_msg.

					        Arguments:
					            bits: An iterable of bits; a true bit follows the right branch,
					              a false bit the left branch.
					            into_msg (uint32_t[:]): Output buffer. Its length determines how
					              many symbols are read.

					        Raises:
					            ValueError: if bits is exhausted before into_msg has been filled.
					        """
					        cdef Node node
					        cdef int branch
					        cdef int i = 0
					        cdef int n = len(into_msg)
					        if n == 0:
					            # Nothing requested: do not consume bits or touch the buffer.
					            return
					        node = self.nodes.back()
					        for bit in bits:
					            branch = node.right if bit else node.left
					            if branch >= 0:
					                node = self.nodes.at(branch)
					            else:
					                # Leaves are stored as -(symbol + 1); recover the symbol
					                # and restart from the root for the next one.
					                into_msg[i] = -(branch + 1)
					                node = self.nodes.back()
					                i += 1
					                if i == n:
					                    break
					        else:
					            raise ValueError(
					                "Ran out of bits after decoding %d of %d symbols" % (i, n))

					    property strings:
					        """The code of every symbol, in symbol order, as a string of '0'/'1'
					        characters in the order the bits are emitted."""
					        @cython.boundscheck(False)
					        @cython.wraparound(False)
					        @cython.nonecheck(False)
					        def __get__(self):
					            output = []
					            cdef int i
					            cdef bytes string
					            cdef Code code
					            for i in range(self.codes.size()):
					                code = self.codes[i]
					                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
					                # Bit 0 of the code is emitted first, so reverse the
					                # most-significant-first rendering.
					                string = string[::-1]
					                output.append(string)
					            return output
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@cython.boundscheck(False)
					@cython.wraparound(False)
					@cython.nonecheck(False)
					cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
					    """Build the Huffman tree for probs by appending internal nodes to nodes.

					    Words are consumed from the end of probs (index i, moving down) and
					    already-built nodes from the front of nodes (index j, moving up). Each
					    step joins two of these under a new node pushed onto nodes, so the last
					    node pushed is the root.

					    Arguments:
					        nodes: Output vector that receives the internal nodes.
					        probs: Weights, expected in descending order; at least three.

					    Returns 0 on success.
					    """
					    assert len(probs) >= 3
					    cdef int size = len(probs)
					    cdef int i = size - 1
					    cdef int j = 0

					    while i >= 0 or (j+1) < nodes.size():
					        if i < 0:
					            # No words left: join the two oldest unjoined nodes.
					            _cover_two_nodes(nodes, j)
					            j += 2
					        elif j >= nodes.size() or (i >= 1 and probs[i-1] < nodes[j].prob):
					            # Either no unjoined node exists yet, or the next two words
					            # both weigh less than the lightest unjoined node.
					            _cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
					            i -= 2
					        elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
					            # Two unjoined nodes both weigh less than the next word.
					            _cover_two_nodes(nodes, j)
					            j += 2
					        else:
					            _cover_one_word_one_node(nodes, j, i, probs[i])
					            i -= 1
					            j += 1
					    return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
					    """Introduce a new non-terminal over the two non-terminals at j and j+1."""
					    cdef Node parent
					    parent.prob = nodes[j].prob + nodes[j+1].prob
					    parent.left = j
					    parent.right = j+1
					    nodes.push_back(parent)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
					    """Introduce a new non-terminal over the word id_ and the non-terminal at j.

					    The child with the strictly smaller weight goes on the left; on a tie
					    the non-terminal goes on the left.
					    """
					    cdef Node parent
					    # A leaf is stored as a negative integer: word index k becomes -(k + 1).
					    cdef int64_t leaf_id = - <int64_t>(id_ + 1)
					    cdef float new_prob = prob + nodes[j].prob
					    parent.prob = new_prob
					    if prob < nodes[j].prob:
					        parent.left = leaf_id
					        parent.right = j
					    else:
					        parent.left = j
					        parent.right = leaf_id
					    nodes.push_back(parent)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
					    """Introduce a new node over two terminals (words id1 and id2).

					    Each leaf is stored as -(id + 1); prob is the weight given to the new node.
					    """
					    cdef Node parent
					    parent.prob = prob
					    parent.left = -(id1+1)
					    parent.right = -(id2+1)
					    nodes.push_back(parent)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
					    """Walk the tree from node i downwards, recording the path to every leaf.

					    path is the bit-address of nodes[i]. A left step appends a 0 bit and a
					    right step a 1 bit. When a branch is a leaf (stored as a negative value,
					    -(id + 1)), the extended path is written to codes[id]. Encoding a symbol
					    is then a lookup in codes; decoding walks nodes bit by bit.
					    """
					    cdef Code zero_path = bit_append(path, 0)
					    cdef Code one_path = bit_append(path, 1)
					    cdef int left = nodes[i].left
					    cdef int right = nodes[i].right

					    # Left branch: store the code for a leaf, otherwise descend.
					    if left < 0:
					        codes[-(left + 1)] = zero_path
					    else:
					        assign_codes(nodes, codes, left, zero_path)
					    # Right branch: same treatment with the 1-extended path.
					    if right < 0:
					        codes[-(right + 1)] = one_path
					    else:
					        assign_codes(nodes, codes, right, one_path)
 | 
				
			||||||
							
								
								
									
										6
									
								
								spacy/serialize/packer.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								spacy/serialize/packer.pxd
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,6 @@
 | 
				
			||||||
 | 
					from ..vocab cimport Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class Packer:
					    # One codec per serialized attribute, in the same order as attrs.
					    cdef tuple _codecs
					    cdef Vocab vocab
					    # Attribute ids handled by this packer. Packer.__init__ assigns
					    # self.attrs, and an extension type only accepts attributes that
					    # are declared here.
					    cdef readonly list attrs
 | 
				
			||||||
							
								
								
									
										136
									
								
								spacy/serialize/packer.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										136
									
								
								spacy/serialize/packer.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,136 @@
 | 
				
			||||||
 | 
					from libc.stdint cimport uint32_t
 | 
				
			||||||
 | 
					from libc.stdint cimport uint64_t
 | 
				
			||||||
 | 
					from libc.math cimport exp as c_exp
 | 
				
			||||||
 | 
					from libcpp.queue cimport priority_queue
 | 
				
			||||||
 | 
					from libcpp.pair cimport pair
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from cymem.cymem cimport Address, Pool
 | 
				
			||||||
 | 
					from preshed.maps cimport PreshMap
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 | 
				
			||||||
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
 | 
					from ..vocab cimport Vocab
 | 
				
			||||||
 | 
					from ..typedefs cimport attr_t
 | 
				
			||||||
 | 
					from .bits cimport BitArray
 | 
				
			||||||
 | 
					from .huffman cimport HuffmanCodec
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from os import path
 | 
				
			||||||
 | 
					import numpy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cimport cython
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Format
 | 
				
			||||||
 | 
					# - Total number of bytes in message (32 bit int) --- handled outside this
 | 
				
			||||||
 | 
					# - Number of words (32 bit int)
 | 
				
			||||||
 | 
					# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
 | 
				
			||||||
 | 
					# - Spaces 1 bit per word
 | 
				
			||||||
 | 
					# - Attributes:
 | 
				
			||||||
 | 
					#       POS tag
 | 
				
			||||||
 | 
					#       Head offset
 | 
				
			||||||
 | 
					#       Dep label
 | 
				
			||||||
 | 
					#       Entity IOB
 | 
				
			||||||
 | 
					#       Entity tag
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def make_vocab_codec(Vocab vocab):
					    """Build a HuffmanCodec whose weights are exp(prob) of each lexeme in vocab.

					    The weights live in a temporary buffer owned by this function; the
					    codec is constructed from a view of that buffer before it is released.
					    """
					    cdef int i
					    cdef int length = len(vocab)
					    cdef Address mem = Address(length, sizeof(float))
					    cdef float* probs = <float*>mem.ptr
					    for i in range(length):
					        # presumably lexeme.prob is a log-probability - TODO confirm
					        probs[i] = <float>c_exp(vocab.lexemes[i].prob)
					    cdef float[:] weights_view = <float[:len(vocab)]>probs
					    return HuffmanCodec(weights_view)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class _BinaryCodec:
					    """Codec that stores each value of a sequence as a single bit."""

					    def encode(self, src, bits):
					        """Append every element of src to bits, in order."""
					        cdef int idx
					        cdef int n_values = len(src)
					        for idx in range(n_values):
					            bits.append(src[idx])

					    def decode(self, dest, bits, n):
					        """Fill dest[0:n] with the next n values taken from bits."""
					        for idx in range(n):
					            dest[idx] = bits.next()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class _AttributeCodec:
					    """Huffman codec for the values of one attribute.

					    Values are ranked by frequency. The rank is what the underlying
					    HuffmanCodec encodes; _map translates value -> rank and _keys
					    translates rank -> value.

					    Arguments:
					        freqs: A sized iterable of (value, count) pairs.
					    """
					    cdef Pool mem
					    cdef attr_t* _keys
					    cdef PreshMap _map
					    cdef HuffmanCodec _codec

					    def __init__(self, freqs):
					        cdef uint64_t key
					        cdef uint64_t count
					        cdef pair[uint64_t, uint64_t] item
					        cdef priority_queue[pair[uint64_t, uint64_t]] items
					        cdef int i = 0

					        # Pairs are ordered by count first, so the queue yields the most
					        # frequent value first: the descending order HuffmanCodec expects.
					        for key, count in freqs:
					            item.first = count
					            item.second = key
					            items.push(item)
					        weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
					        # The pool must exist before memory can be allocated from it.
					        self.mem = Pool()
					        self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
					        self._map = PreshMap()
					        while not items.empty():
					            item = items.top()
					            weights[i] = item.first
					            self._keys[i] = item.second
					            self._map[self._keys[i]] = i
					            items.pop()
					            i += 1
					        self._codec = HuffmanCodec(weights)

					    def encode(self, attr_t[:] msg, BitArray into_bits):
					        """Append the Huffman codes of the attribute values in msg to into_bits.

					        msg is left untouched: the ranks are written to a separate uint32
					        buffer, which is the element type HuffmanCodec.encode declares.
					        """
					        cdef int i
					        cdef int n = len(msg)
					        cdef uint32_t[:] ranks = numpy.ndarray(shape=(n,), dtype=numpy.uint32)
					        for i in range(n):
					            ranks[i] = self._map[msg[i]]
					        self._codec.encode(ranks, into_bits)

					    def decode(self, BitArray bits, attr_t[:] into_msg):
					        """Decode len(into_msg) attribute values from bits into into_msg."""
					        cdef int i
					        cdef int n = len(into_msg)
					        # Decode ranks into a uint32 buffer, as HuffmanCodec.decode
					        # declares, then translate each rank back to its value.
					        cdef uint32_t[:] ranks = numpy.ndarray(shape=(n,), dtype=numpy.uint32)
					        self._codec.decode(bits, ranks)
					        for i in range(n):
					            into_msg[i] = self._keys[ranks[i]]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class Packer:
					    """Serialize the attributes of a document to a BitArray, and back.

					    Arguments:
					        vocab (Vocab): Vocabulary used to build the codec for the ID attribute.
					        list_of_attr_freqs: Iterable of (attr, freqs) pairs. ID gets the
					          vocabulary codec, SPACY a one-bit-per-value codec, and every
					          other attribute a Huffman codec built from its freqs.
					    """
					    def __init__(self, Vocab vocab, list_of_attr_freqs):
					        self.vocab = vocab
					        codecs = []
					        self.attrs = []

					        for attr, freqs in list_of_attr_freqs:
					            if attr == ID:
					                codecs.append(make_vocab_codec(vocab))
					            elif attr == SPACY:
					                codecs.append(_BinaryCodec())
					            else:
					                codecs.append(_AttributeCodec(freqs))
					            self.attrs.append(attr)
					        self._codecs = tuple(codecs)

					    def __call__(self, msg_or_bits):
					        """Unpack a BitArray into a document, or pack anything else into bits."""
					        if isinstance(msg_or_bits, BitArray):
					            bits = msg_or_bits
					            # NOTE(review): from_array is called on the Doc class with the
					            # vocab in first position - confirm against Doc's signature.
					            return Doc.from_array(self.vocab, self.attrs, self.deserialize(bits))
					        else:
					            msg = msg_or_bits
					            return self.serialize(msg.to_array(self.attrs))

					    def serialize(self, array):
					        """Encode array into a new BitArray.

					        array has one row per token and one column per attribute, in the
					        order of self.attrs. The row count is written first as 32 bits,
					        then each codec encodes its own column.
					        """
					        cdef BitArray bits = BitArray()
					        cdef uint32_t length = len(array)
					        bits.extend(length, 32)
					        for i, codec in enumerate(self._codecs):
					            # NOTE(review): the codecs take typed buffers; confirm the
					            # array dtype matches what each encode() declares.
					            codec.encode(array[:, i], bits)
					        return bits

					    def deserialize(self, bits):
					        """Decode bits into an array with one row per token and one column
					        per attribute, mirroring serialize."""
					        cdef uint32_t length = bits.read(32)
					        array = numpy.ndarray(shape=(length, len(self._codecs)), dtype=numpy.int)
					        for i, codec in enumerate(self._codecs):
					            # Every codec fills its destination in place, but the
					            # one-bit codec takes its arguments in a different order.
					            # NOTE(review): confirm the array dtype matches the typed
					            # buffers the Huffman-based decode() methods declare.
					            if isinstance(codec, _BinaryCodec):
					                codec.decode(array[:, i], bits, length)
					            else:
					                codec.decode(bits, array[:, i])
					        return array
 | 
				
			||||||
| 
						 | 
					@ -4,7 +4,6 @@ from libc.string cimport memcpy, memset
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..lexeme cimport EMPTY_LEXEME
 | 
					from ..lexeme cimport EMPTY_LEXEME
 | 
				
			||||||
from ..serialize import BitArray
 | 
					 | 
				
			||||||
from ..strings cimport slice_unicode
 | 
					from ..strings cimport slice_unicode
 | 
				
			||||||
from ..typedefs cimport attr_t, flags_t
 | 
					from ..typedefs cimport attr_t, flags_t
 | 
				
			||||||
from ..attrs cimport attr_id_t
 | 
					from ..attrs cimport attr_id_t
 | 
				
			||||||
| 
						 | 
					@ -371,10 +370,12 @@ cdef class Doc:
 | 
				
			||||||
        return self[start]
 | 
					        return self[start]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_array(self, attrs, array):
 | 
					    def from_array(self, attrs, array):
 | 
				
			||||||
        cdef int i
 | 
					        cdef int i, col
 | 
				
			||||||
        cdef attr_id_t attr_id
 | 
					        cdef attr_id_t attr_id
 | 
				
			||||||
        cdef TokenC* tokens = self.data
 | 
					        cdef TokenC* tokens = self.data
 | 
				
			||||||
        for attr_id in attrs:
 | 
					        cdef int length = len(array)
 | 
				
			||||||
 | 
					        for col, attr_id in enumerate(attrs): 
 | 
				
			||||||
 | 
					            values = array[:, col]
 | 
				
			||||||
            if attr_id == HEAD:
 | 
					            if attr_id == HEAD:
 | 
				
			||||||
                for i in range(length):
 | 
					                for i in range(length):
 | 
				
			||||||
                    tokens[i].head = values[i]
 | 
					                    tokens[i].head = values[i]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -35,5 +35,3 @@ cdef class Vocab:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef PreshMap _map
 | 
					    cdef PreshMap _map
 | 
				
			||||||
    cdef readonly int repvec_length
 | 
					    cdef readonly int repvec_length
 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef public object _codec
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,6 @@
 | 
				
			||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 | 
					from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 | 
				
			||||||
from libc.string cimport memset
 | 
					from libc.string cimport memset
 | 
				
			||||||
from libc.stdint cimport int32_t
 | 
					from libc.stdint cimport int32_t
 | 
				
			||||||
from libc.math cimport exp as c_exp
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
import bz2
 | 
					import bz2
 | 
				
			||||||
from os import path
 | 
					from os import path
 | 
				
			||||||
| 
						 | 
					@ -15,7 +14,6 @@ from .strings cimport slice_unicode
 | 
				
			||||||
from .strings cimport hash_string
 | 
					from .strings cimport hash_string
 | 
				
			||||||
from .orth cimport word_shape
 | 
					from .orth cimport word_shape
 | 
				
			||||||
from .typedefs cimport attr_t
 | 
					from .typedefs cimport attr_t
 | 
				
			||||||
from .serialize cimport HuffmanCodec
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from cymem.cymem cimport Address
 | 
					from cymem.cymem cimport Address
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -227,22 +225,6 @@ cdef class Vocab:
 | 
				
			||||||
                lex.repvec = EMPTY_VEC
 | 
					                lex.repvec = EMPTY_VEC
 | 
				
			||||||
        return vec_len
 | 
					        return vec_len
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    property codec:
 | 
					 | 
				
			||||||
        def __get__(self):
 | 
					 | 
				
			||||||
            cdef Address mem
 | 
					 | 
				
			||||||
            cdef int i
 | 
					 | 
				
			||||||
            cdef float[:] cv_probs
 | 
					 | 
				
			||||||
            if self._codec is not None:
 | 
					 | 
				
			||||||
                return self._codec
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                mem = Address(len(self), sizeof(float))
 | 
					 | 
				
			||||||
                probs = <float*>mem.ptr
 | 
					 | 
				
			||||||
                for i in range(len(self)):
 | 
					 | 
				
			||||||
                    probs[i] = <float>c_exp(self.lexemes[i].prob)
 | 
					 | 
				
			||||||
                cv_probs = <float[:len(self)]>probs
 | 
					 | 
				
			||||||
                self._codec = HuffmanCodec(cv_probs, 0)
 | 
					 | 
				
			||||||
                return self._codec
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
def write_binary_vectors(in_loc, out_loc):
 | 
					def write_binary_vectors(in_loc, out_loc):
 | 
				
			||||||
    cdef _CFile out_file = _CFile(out_loc, 'wb')
 | 
					    cdef _CFile out_file = _CFile(out_loc, 'wb')
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,14 +3,15 @@ from __future__ import division
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy.serialize import HuffmanCodec
 | 
					from spacy.serialize.huffman import HuffmanCodec
 | 
				
			||||||
 | 
					from spacy.serialize.bits import BitArray
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from heapq import heappush, heappop, heapify
 | 
					from heapq import heappush, heappop, heapify
 | 
				
			||||||
from collections import defaultdict
 | 
					from collections import defaultdict
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Vocab(object):
 | 
					class MockPacker(object):
 | 
				
			||||||
    def __init__(self, freqs):
 | 
					    def __init__(self, freqs):
 | 
				
			||||||
        freqs['-eol-'] = 5
 | 
					        freqs['-eol-'] = 5
 | 
				
			||||||
        total = sum(freqs.values())
 | 
					        total = sum(freqs.values())
 | 
				
			||||||
| 
						 | 
					@ -19,15 +20,19 @@ class Vocab(object):
 | 
				
			||||||
        self.symbols = [sym for sym, freq in by_freq]
 | 
					        self.symbols = [sym for sym, freq in by_freq]
 | 
				
			||||||
        self.probs = numpy.array([item[1] / total for item in by_freq], dtype=numpy.float32)
 | 
					        self.probs = numpy.array([item[1] / total for item in by_freq], dtype=numpy.float32)
 | 
				
			||||||
        self.table = {sym: i for i, sym in enumerate(self.symbols)}
 | 
					        self.table = {sym: i for i, sym in enumerate(self.symbols)}
 | 
				
			||||||
        self.codec = HuffmanCodec(self.probs, self.table['-eol-'])
 | 
					        self.codec = HuffmanCodec(self.probs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pack(self, message):
 | 
					    def pack(self, message):
 | 
				
			||||||
        seq = [self.table[sym] for sym in message]
 | 
					        seq = [self.table[sym] for sym in message]
 | 
				
			||||||
        return self.codec.encode(numpy.array(seq, dtype=numpy.uint32))
 | 
					        msg = numpy.array(seq, dtype=numpy.uint32)
 | 
				
			||||||
 | 
					        bits = BitArray()
 | 
				
			||||||
 | 
					        self.codec.encode(msg, bits)
 | 
				
			||||||
 | 
					        return bits
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def unpack(self, packed):
 | 
					    def unpack(self, bits, n):
 | 
				
			||||||
        ids = self.codec.decode(packed)
 | 
					        msg = numpy.array(range(n), dtype=numpy.uint32)
 | 
				
			||||||
        return [self.symbols[i] for i in ids]
 | 
					        self.codec.decode(bits, msg)
 | 
				
			||||||
 | 
					        return [self.symbols[i] for i in msg]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
def py_encode(symb2freq):
 | 
					def py_encode(symb2freq):
 | 
				
			||||||
| 
						 | 
					@ -60,7 +65,7 @@ def test1():
 | 
				
			||||||
    probs[8] = 0.0001
 | 
					    probs[8] = 0.0001
 | 
				
			||||||
    probs[9] = 0.000001
 | 
					    probs[9] = 0.000001
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    codec = HuffmanCodec(probs, 9)
 | 
					    codec = HuffmanCodec(probs)
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    py_codes = py_encode(dict(enumerate(probs)))
 | 
					    py_codes = py_encode(dict(enumerate(probs)))
 | 
				
			||||||
    py_codes = py_codes.items()
 | 
					    py_codes = py_codes.items()
 | 
				
			||||||
| 
						 | 
					@ -71,19 +76,19 @@ def test1():
 | 
				
			||||||
def test_round_trip():
 | 
					def test_round_trip():
 | 
				
			||||||
    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
 | 
					    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
 | 
				
			||||||
            'lazy': 1, 'dog': 2, '.': 9}
 | 
					            'lazy': 1, 'dog': 2, '.': 9}
 | 
				
			||||||
    vocab = Vocab(freqs)
 | 
					    packer = MockPacker(freqs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    message = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the',
 | 
					    message = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the',
 | 
				
			||||||
                'the', 'lazy', 'dog', '.']
 | 
					                'the', 'lazy', 'dog', '.']
 | 
				
			||||||
    strings = list(vocab.codec.strings)
 | 
					    strings = list(packer.codec.strings)
 | 
				
			||||||
    codes = {vocab.symbols[i]: strings[i] for i in range(len(vocab.symbols))}
 | 
					    codes = {packer.symbols[i]: strings[i] for i in range(len(packer.symbols))}
 | 
				
			||||||
    packed = vocab.pack(message)
 | 
					    bits = packer.pack(message)
 | 
				
			||||||
    string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in packed.as_bytes())
 | 
					    string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
 | 
				
			||||||
    for word in message:
 | 
					    for word in message:
 | 
				
			||||||
        code = codes[word]
 | 
					        code = codes[word]
 | 
				
			||||||
        assert string[:len(code)] == code
 | 
					        assert string[:len(code)] == code
 | 
				
			||||||
        string = string[len(code):]
 | 
					        string = string[len(code):]
 | 
				
			||||||
    unpacked = vocab.unpack(packed)
 | 
					    unpacked = packer.unpack(bits, len(message))
 | 
				
			||||||
    assert message == unpacked
 | 
					    assert message == unpacked
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -92,13 +97,12 @@ def test_rosetta():
 | 
				
			||||||
    symb2freq = defaultdict(int)
 | 
					    symb2freq = defaultdict(int)
 | 
				
			||||||
    for ch in txt:
 | 
					    for ch in txt:
 | 
				
			||||||
        symb2freq[ch] += 1
 | 
					        symb2freq[ch] += 1
 | 
				
			||||||
    symb2freq['-eol-'] = 1
 | 
					 | 
				
			||||||
    by_freq = symb2freq.items()
 | 
					    by_freq = symb2freq.items()
 | 
				
			||||||
    by_freq.sort(reverse=True, key=lambda item: item[1])
 | 
					    by_freq.sort(reverse=True, key=lambda item: item[1])
 | 
				
			||||||
    symbols = [sym for sym, prob in by_freq]
 | 
					    symbols = [sym for sym, prob in by_freq]
 | 
				
			||||||
    probs = numpy.array([prob for sym, prob in by_freq], dtype=numpy.float32)
 | 
					    probs = numpy.array([prob for sym, prob in by_freq], dtype=numpy.float32)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    codec = HuffmanCodec(probs, symbols.index('-eol-'))
 | 
					    codec = HuffmanCodec(probs)
 | 
				
			||||||
    py_codec = py_encode(symb2freq)
 | 
					    py_codec = py_encode(symb2freq)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    my_lengths = defaultdict(int)
 | 
					    my_lengths = defaultdict(int)
 | 
				
			||||||
| 
						 | 
					@ -112,6 +116,7 @@ def test_rosetta():
 | 
				
			||||||
    assert my_exp_len == py_exp_len
 | 
					    assert my_exp_len == py_exp_len
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
def test_vocab(EN):
 | 
					def test_vocab(EN):
 | 
				
			||||||
    codec = EN.vocab.codec
 | 
					    codec = EN.vocab.codec
 | 
				
			||||||
    expected_length = 0
 | 
					    expected_length = 0
 | 
				
			||||||
| 
						 | 
					@ -137,3 +142,4 @@ def test_freqs():
 | 
				
			||||||
    for i, code in enumerate(codec.strings):
 | 
					    for i, code in enumerate(codec.strings):
 | 
				
			||||||
        expected_length += len(code) * freqs[i]
 | 
					        expected_length += len(code) * freqs[i]
 | 
				
			||||||
    assert 8 < expected_length < 14
 | 
					    assert 8 < expected_length < 14
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user