* Fix tokenizer

Matthew Honnibal 2015-07-22 14:10:30 +02:00
parent c86dbe4944
commit 0c507bd80a


@@ -10,6 +10,7 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
+from murmurhash.mrmr cimport hash64
 from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
@@ -91,11 +92,11 @@ cdef class Tokenizer:
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
         for i in range(1, length):
             uc = chars_ptr[i]
             if Py_UNICODE_ISSPACE(uc) != in_ws:
                 if start < i:
-                    key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
+                    key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
                         self._tokenize(tokens, string[start:i], key)
@@ -107,7 +108,7 @@ cdef class Tokenizer:
                 start = i
         i += 1
         if start < i:
-            key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
+            key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
                 self._tokenize(tokens, string[start:], key)
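
For context, the substantive change in both code hunks is the same: the cache key for a span must be hashed from &chars_ptr[start] (the start of the current span), not from chars_ptr (the start of the whole buffer). Before the fix, every span of a given length hashed to the key of the string's prefix of that length, so the token cache could serve cached tokens for the wrong text. Below is a minimal pure-Python sketch of that effect; it uses hashlib as a stand-in for murmurhash's hash64, and the helper name span_key and its flag are illustrative, not spaCy API.

    import hashlib


    def span_key(text, start, end, from_span_start=True):
        # Stand-in for hash64 over the character buffer: hashes either the
        # span itself (the fixed behaviour) or the buffer prefix of the same
        # length (the pre-fix behaviour).
        if from_span_start:
            data = text[start:end]       # like &chars_ptr[start] in the fix
        else:
            data = text[:end - start]    # like chars_ptr before the fix
        return hashlib.blake2b(data.encode("utf-8"), digest_size=8).hexdigest()


    text = "cats chase dogs"
    # "cats" (0..4) and "dogs" (11..15) have equal length, so the pre-fix
    # keys collide: the cache would hand back the tokens of "cats" for "dogs".
    assert span_key(text, 0, 4, from_span_start=False) == span_key(text, 11, 15, from_span_start=False)
    # With the fix, distinct spans hash to distinct keys.
    assert span_key(text, 0, 4) != span_key(text, 11, 15)

Hashing from the span start keeps the cache keyed by the span's actual characters, which is what the _try_cache and _tokenize calls above appear to assume.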