* Fix bug in decoding non-ascii characters

This commit is contained in:
Matthew Honnibal 2015-07-27 21:43:58 +02:00
parent 6deb1e84b6
commit 1601e488ee

View File

@@ -152,7 +152,7 @@ cdef class Packer:
cdef int32_t length = len(utf8_str) cdef int32_t length = len(utf8_str)
# Signal chars with negative length # Signal chars with negative length
bits.extend(-length, 32) bits.extend(-length, 32)
self.char_codec.encode(bytearray(utf8_str), bits) self.char_codec.encode(utf8_str, bits)
cdef int i, j cdef int i, j
for i in range(doc.length): for i in range(doc.length):
for j in range(doc.data[i].lex.length-1): for j in range(doc.data[i].lex.length-1):
@@ -175,24 +175,24 @@ cdef class Packer:
doc.push_back(lex, space) doc.push_back(lex, space)
return doc return doc
def _char_decode(self, BitArray bits, int32_t n, Doc doc): def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
cdef bytearray utf8_str = bytearray(n) cdef bytearray utf8_str = bytearray(n_bytes)
self.char_codec.decode(bits, utf8_str) self.char_codec.decode(bits, utf8_str)
cdef unicode string = utf8_str.decode('utf8') cdef unicode string = utf8_str.decode('utf8')
cdef int start = 0 cdef int start = 0
cdef bint is_spacy cdef bint is_spacy
cdef int length = len(string) cdef int n_unicode_chars = len(string)
cdef int i = 0 cdef int i = 0
cdef bint is_end_token cdef bint is_end_token
for is_end_token in bits: for is_end_token in bits:
if is_end_token: if is_end_token:
span = string[start:i+1] span = string[start:i+1]
lex = self.vocab.get(doc.mem, span) lex = self.vocab.get(doc.mem, span)
is_spacy = (i+1) < length and string[i+1] == u' ' is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '
doc.push_back(lex, is_spacy) doc.push_back(lex, is_spacy)
start = i + 1 + is_spacy start = i + 1 + is_spacy
i += 1 i += 1
if i >= n: if i >= n_unicode_chars:
break break
return doc return doc