* Use Py_UNICODE in tokenizer for now, while sorting out Py_UCS4 stuff

Matthew Honnibal 2015-07-22 13:38:45 +02:00
parent 4d61239eac
commit 2fc66e3723


@@ -80,34 +80,37 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        chars = <Py_UNICODE*>string
-        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
+        cdef bint in_ws = Py_UNICODE_ISSPACE(string[0])
         cdef unicode span
+        # Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
+        # But this is hard --- I need to acquire a pointer, but there's no
+        # Py_UCS4 API in Python 2.
+        cdef Py_UNICODE uc
+        cdef Py_UNICODE* chars_ptr = <Py_UNICODE*>string
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
         for i in range(1, length):
-            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
+            uc = chars_ptr[i]
+            if Py_UNICODE_ISSPACE(uc) != in_ws:
                 if start < i:
-                    span = string[start:i]
-                    key = hash_string(span)
+                    key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
-                        self._tokenize(tokens, span, key)
+                        self._tokenize(tokens, string[start:i], key)
                 in_ws = not in_ws
-                start = i
-                if chars[i] == ' ':
+                if uc == ' ':
                     tokens.data[tokens.length - 1].spacy = True
-                    start += 1
+                    start = i + 1
+                else:
+                    start = i
         i += 1
         if start < i:
-            span = string[start:i]
-            key = hash_string(span)
+            key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
-                self._tokenize(tokens, span, key)
+                self._tokenize(tokens, string[start:], key)
         tokens.data[tokens.length - 1].spacy = string[-1] == ' '
         return tokens
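
For readers who want to check the control flow without compiling Cython, here is a rough pure-Python sketch of the loop above. It is an approximation, not the real method: the cache lookup is omitted, `split_like_tokenizer` is a made-up name, and each token is a `[text, spacy]` pair standing in for the token struct's `spacy` flag.

    def split_like_tokenizer(string):
        """Split like str.split, except that a single ' ' between spans
        is not emitted as a token; it sets the previous token's `spacy`
        flag instead. Longer whitespace runs still produce tokens."""
        tokens = []  # [text, spacy] pairs
        if not string:
            return tokens
        in_ws = string[0].isspace()
        start = 0
        i = 0  # mirrors `cdef int i = 0`, so length-1 strings work
        for i in range(1, len(string)):
            if string[i].isspace() != in_ws:
                if start < i:
                    tokens.append([string[start:i], False])
                in_ws = not in_ws
                if string[i] == ' ':
                    # Absorb the single space into the previous token.
                    if tokens:
                        tokens[-1][1] = True
                    start = i + 1
                else:
                    start = i
        i += 1
        if start < i:
            tokens.append([string[start:], False])
        if tokens:
            tokens[-1][1] = string[-1] == ' '
        return tokens

For example, `split_like_tokenizer('Hello  world ')` returns `[['Hello', True], [' ', False], ['world', True]]`: the first space of the double space is absorbed into `'Hello'`, and the remainder becomes a one-space token, matching the comment that only spans of exactly ' ' are ignored.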
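The switch from `hash_string(span)` to `hash64(chars_ptr, ...)` means the cache key is computed directly from the raw character buffer, so the unicode slice is only allocated on a cache miss (`hash64` there is a C hash over raw memory, not Python's built-in `hash`). A loose pure-Python analogue of that idea, using utf-32-le bytes as a stand-in for the fixed-width Py_UNICODE buffer and a hypothetical `span_key` helper:

    buf = u"Hello  world ".encode("utf-32-le")  # 4 bytes per code point
    view = memoryview(buf)

    def span_key(start, end):
        # Hash the span's raw bytes without building a substring:
        # slicing a memoryview shares the buffer rather than copying,
        # and a read-only bytes-backed view is hashable.
        return hash(view[4 * start:4 * end])

The design point is the same in both versions: keys for the cache can be derived from memory that already exists, deferring the cost of constructing a Python string to the miss path.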