mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 07:44:12 +03:00
Add method to export utf8 array to Doc
This commit is contained in:
parent
e1a83d15ed
commit
bba5f57f91
|
@ -1022,6 +1022,37 @@ cdef class Doc:
|
|||
data["_"][attr] = value
|
||||
return data
|
||||
|
||||
def to_utf8_array(self, int nr_char=-1):
    """Encode each token's text as UTF-8 and export the bytes to a
    fixed-width uint8 array, with one row per token.

    Within a row, bytes are taken alternately from the front and the back
    of the encoded string, i.e. in the index order:
        0, -1, 1, -2, etc
    For example, if the array is sliced array[:, :8], each row holds the
    first 4 bytes and the last 4 bytes of the word (interleaved), with the
    middle bytes clipped out. Because the unit is a UTF-8 byte rather than
    a character, clipping can split a multi-byte character.

    Cells that are not filled by a token's bytes hold the pad value 255,
    a byte value that never occurs in valid UTF-8.

    nr_char (int): Number of columns in the output. The default, -1, uses
        the byte length of the longest encoded token (0 if there are no
        tokens).
    RETURNS (numpy.ndarray): uint8 array with one row per token and
        `nr_char` columns.
    """
    cdef np.ndarray output
    cdef int i, j, start_idx, end_idx
    cdef bytes byte_string
    byte_strings = [token.orth_.encode('utf8') for token in self]
    if nr_char == -1:
        # max() raises ValueError on an empty sequence, so fall back to a
        # zero-width array when there are no tokens.
        nr_char = max(len(bs) for bs in byte_strings) if byte_strings else 0
    # Start from an all-padding array; only real bytes are written below.
    output = numpy.full((len(byte_strings), nr_char), 255, dtype='uint8')
    for i, byte_string in enumerate(byte_strings):
        j = 0
        start_idx = 0
        end_idx = len(byte_string) - 1
        # Two cursors walk inwards from both ends. Each pass writes one
        # byte from the front and, if room and bytes remain, one from the
        # back. The loop stops when the row is full or the cursors cross.
        while j < nr_char and start_idx <= end_idx:
            output[i, j] = <unsigned char>byte_string[start_idx]
            start_idx += 1
            j += 1
            if j < nr_char and start_idx <= end_idx:
                output[i, j] = <unsigned char>byte_string[end_idx]
                end_idx -= 1
                j += 1
    return output
|
||||
|
||||
|
||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
|
||||
cdef int i
|
||||
|
|
Loading…
Reference in New Issue
Block a user