diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 988fa715d..c3b96c275 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -73,6 +73,6 @@ cdef class Language: cpdef Tokens tokenize(self, unicode text) cpdef Lexeme lookup(self, unicode text) - cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) + cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1 cdef list _split(self, unicode string) cdef int _split_one(self, unicode word) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index dd3394152..3dca09ac6 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -100,32 +100,29 @@ cdef class Language: self._tokenize(tokens, &characters[start], i - start) return tokens - cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length): + cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1: cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0) - cdef unicode string - cdef bint free_chunk = False cdef size_t i = 0 cdef LexemeC** lexemes = self.cache[hashed] if lexemes is not NULL: while lexemes[i] != NULL: tokens.push_back(lexemes[i]) i += 1 + return 0 + + cdef unicode string = characters[:length] + cdef list substrings = self._split(string) + lexemes = calloc(len(substrings) + 1, sizeof(LexemeC*)) + cdef unicode substring + for i, substring in enumerate(substrings): + lexemes[i] = self.lexicon.get(substring) + tokens.push_back(lexemes[i]) + lexemes[i + 1] = NULL + if self.cache_size < 1000000: + self.cache[hashed] = lexemes + self.cache_size += 1 else: - string = characters[:length] - substrings = self._split(string) - lexemes = calloc(len(substrings) + 1, sizeof(LexemeC*)) - for i, substring in enumerate(substrings): - lexemes[i] = self.lexicon.get(substring) - tokens.push_back(lexemes[i]) - lexemes[i + 1] = NULL - # The intuition here is that if an element belongs in the cache, it - # has several chances to get in. And if the cache is large, we less - # believe that the element belongs there. - if self.cache_size == 0 or random.random() < (100000.0 / self.cache_size): - self.cache[hashed] = lexemes - self.cache_size += 1 - else: - free(lexemes) + free(lexemes) cdef list _split(self, unicode string): """Find how to split a contiguous span of non-space characters into substrings.