def to_utf8_array(self, int nr_char=-1):
    """Encode word strings to utf8, and export to a fixed-width array
    of bytes.

    The utf8 bytes of each word are placed into its row in the order:
        0, -1, 1, -2, etc
    i.e. alternating between the front and the back of the byte string.
    For example, if the array is sliced array[:, :8], the array will
    contain the first 4 bytes and last 4 bytes of each word, with the
    middle bytes clipped out. Note that positions are utf8 *bytes*, not
    unicode characters, so a multi-byte character may be split by the
    clipping. The value 255 is used as a pad value.

    nr_char (int): Width of the output array, i.e. the maximum number of
        bytes kept per word. With the default of -1, the width is the
        byte length of the longest word in the document (0 if the
        document has no tokens).
    RETURNS (numpy.ndarray): Array of dtype uint8 and shape
        (number of tokens, nr_char).
    """
    cdef np.ndarray output
    cdef int i, j, start_idx, end_idx
    cdef bytes byte_string
    cdef unsigned char utf8_char
    byte_strings = [token.orth_.encode('utf8') for token in self]
    if nr_char == -1:
        # max() raises ValueError on an empty sequence, so an empty
        # document gets a zero-width array instead of an exception.
        nr_char = max(len(bs) for bs in byte_strings) if byte_strings else 0
    # Start from an all-padding array; only real bytes are overwritten.
    output = numpy.full((len(byte_strings), nr_char), 255, dtype='uint8')
    for i, byte_string in enumerate(byte_strings):
        j = 0
        start_idx = 0
        end_idx = len(byte_string) - 1
        # Consume the byte string from both ends, front byte first, until
        # either the row is full or the two cursors have crossed.
        while j < nr_char and start_idx <= end_idx:
            # Go through a C unsigned char so the value stored is always
            # the integer byte value, independent of what indexing a
            # bytes object yields at the Python level.
            utf8_char = <unsigned char>byte_string[start_idx]
            output[i, j] = utf8_char
            start_idx += 1
            j += 1
            if j < nr_char and start_idx <= end_idx:
                utf8_char = <unsigned char>byte_string[end_idx]
                output[i, j] = utf8_char
                end_idx -= 1
                j += 1
    return output