* Use Py_UNICODE in tokenizer for now, while sorting out Py_UCS4 stuff

Matthew Honnibal 2015-07-22 13:38:45 +02:00
parent 4d61239eac
commit 2fc66e3723


@@ -80,34 +80,37 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        chars = <Py_UNICODE*>string
-        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
+        cdef bint in_ws = Py_UNICODE_ISSPACE(string[0])
         cdef unicode span
+        # Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
+        # But this is hard --- I need to acquire a pointer, but there's no
+        # Py_UCS4 API in Python 2.
+        cdef Py_UNICODE uc
+        cdef Py_UNICODE* chars_ptr = <Py_UNICODE*>string
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
         for i in range(1, length):
-            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
+            uc = chars_ptr[i]
+            if Py_UNICODE_ISSPACE(uc) != in_ws:
                 if start < i:
-                    span = string[start:i]
-                    key = hash_string(span)
+                    key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
-                        self._tokenize(tokens, span, key)
+                        self._tokenize(tokens, string[start:i], key)
                 in_ws = not in_ws
-                start = i
-                if chars[i] == ' ':
+                if uc == ' ':
                     tokens.data[tokens.length - 1].spacy = True
-                    start += 1
+                    start = i + 1
+                else:
+                    start = i
         i += 1
         if start < i:
-            span = string[start:i]
-            key = hash_string(span)
+            key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
-                self._tokenize(tokens, span, key)
+                self._tokenize(tokens, string[start:], key)
         tokens.data[tokens.length - 1].spacy = string[-1] == ' '
         return tokens
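
For readers who want to check the control flow without compiling Cython, here is a rough pure-Python sketch of the loop above. It is an approximation, not the real method: the cache lookup is omitted, `split_like_tokenizer` is a made-up name, and each token is a `[text, spacy]` pair standing in for the token struct's `spacy` flag.

    def split_like_tokenizer(string):
        """Split like str.split, except that a single ' ' between spans
        is not emitted as a token; it sets the previous token's `spacy`
        flag instead. Longer whitespace runs still produce tokens."""
        tokens = []  # [text, spacy] pairs
        if not string:
            return tokens
        in_ws = string[0].isspace()
        start = 0
        i = 0  # mirrors `cdef int i = 0`, so length-1 strings work
        for i in range(1, len(string)):
            if string[i].isspace() != in_ws:
                if start < i:
                    tokens.append([string[start:i], False])
                in_ws = not in_ws
                if string[i] == ' ':
                    # Absorb the single space into the previous token.
                    if tokens:
                        tokens[-1][1] = True
                    start = i + 1
                else:
                    start = i
        i += 1
        if start < i:
            tokens.append([string[start:], False])
        if tokens:
            tokens[-1][1] = string[-1] == ' '
        return tokens

For example, `split_like_tokenizer('Hello  world ')` returns `[['Hello', True], [' ', False], ['world', True]]`: the first space of the double space is absorbed into `'Hello'`, and the remainder becomes a one-space token, matching the comment that only spans of exactly ' ' are ignored.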
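The switch from `hash_string(span)` to `hash64(chars_ptr, ...)` means the cache key is computed directly from the raw character buffer, so the unicode slice is only allocated on a cache miss (`hash64` there is a C hash over raw memory, not Python's built-in `hash`). A loose pure-Python analogue of that idea, using utf-32-le bytes as a stand-in for the fixed-width Py_UNICODE buffer and a hypothetical `span_key` helper:

    buf = u"Hello  world ".encode("utf-32-le")  # 4 bytes per code point
    view = memoryview(buf)

    def span_key(start, end):
        # Hash the span's raw bytes without building a substring:
        # slicing a memoryview shares the buffer rather than copying,
        # and a read-only bytes-backed view is hashable.
        return hash(view[4 * start:4 * end])

The design point is the same in both versions: keys for the cache can be derived from memory that already exists, deferring the cost of constructing a Python string to the miss path.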