* Pass only the tokens vector to _tokenize, instead of the whole Python object.

Matthew Honnibal 2014-09-15 04:01:38 +02:00
parent 08cef75ffd
commit e68a431e5e
2 changed files with 11 additions and 11 deletions
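Why this matters, as a minimal Cython sketch of the pattern (hypothetical names: FooC and Doc stand in for the real LexemeC and Tokens types; this illustrates the idea, not the project's code). A helper that receives only the C++ vector is a pure C call, while one that receives the wrapper must accept and dereference a Python object on every call:

# distutils: language = c++
# Hypothetical sketch: FooC/Doc stand in for LexemeC/Tokens.
from libcpp.vector cimport vector

cdef struct FooC:
    int id

cdef class Doc:
    # Python-visible wrapper that owns a plain C++ vector.
    cdef vector[FooC*] v

cdef int fill_via_wrapper(Doc doc, FooC* item) except -1:
    # Takes the whole Python object: every call passes a Python
    # reference just to reach doc.v.
    doc.v.push_back(item)
    return 0

cdef int fill_via_vector(vector[FooC*]* out, FooC* item) except -1:
    # Takes only the vector: no Python object crosses the call.
    out.push_back(item)
    return 0

A caller holding the wrapper hands over the vector once, e.g. fill_via_vector(&doc.v, item). In the diff below, Tokens.v appears to already be a vector pointer (note the tokens_v[0][j] dereference), so tokenize passes tokens.v directly.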


@@ -42,5 +42,5 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
-    cdef int _tokenize(self, Tokens tokens, String* string)
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
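For the new declaration to compile, the C++ vector type must be visible in this header; assuming it is not already cimported elsewhere in the file, the standard Cython declaration is:

from libcpp.vector cimport vector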


@@ -182,25 +182,25 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    self._tokenize(tokens, &span)
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
-            self._tokenize(tokens, &span)
+            self._tokenize(tokens.v, &span)
         return tokens

-    cdef int _tokenize(self, Tokens tokens, String* string):
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string):
         cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
         cdef size_t i
         if lexemes != NULL:
             i = 0
             while lexemes[i] != NULL:
-                tokens.v.push_back(lexemes[i])
+                tokens_v.push_back(lexemes[i])
                 i += 1
             return 0
         cdef uint64_t key = string.key
-        cdef size_t first_token = len(tokens)
+        cdef size_t first_token = tokens_v.size()
         cdef int split
         cdef int remaining = string.n
         cdef String prefix
@@ -212,14 +212,14 @@ cdef class Language:
             if lexemes != NULL:
                 i = 0
                 while lexemes[i] != NULL:
-                    tokens.v.push_back(lexemes[i])
+                    tokens_v.push_back(lexemes[i])
                     i += 1
             else:
-                tokens.v.push_back(<LexemeC*>self.lexicon.get(&prefix))
-        lexemes = <LexemeC**>calloc(len(tokens) - first_token, sizeof(LexemeC*))
+                tokens_v.push_back(<LexemeC*>self.lexicon.get(&prefix))
+        lexemes = <LexemeC**>calloc(tokens_v.size() - first_token, sizeof(LexemeC*))
         cdef size_t j
-        for i, j in enumerate(range(first_token, tokens.v.size())):
-            lexemes[i] = tokens.v[0][j]
+        for i, j in enumerate(range(first_token, tokens_v.size())):
+            lexemes[i] = tokens_v[0][j]
         self.cache.set(key, lexemes)

     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
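For context, _tokenize is memoized: a cache hit returns a NULL-terminated LexemeC** array that is copied straight into the output vector, and a miss finishes by snapshotting the freshly appended span of the vector back into the cache. A standalone sketch of that scheme under hypothetical names (the read loop depends on a NULL terminator, so this sketch allocates one extra zeroed slot for it):

# distutils: language = c++
# Hypothetical sketch of NULL-terminated cache entries.
from libcpp.vector cimport vector
from libc.stdlib cimport calloc

cdef struct FooC:
    int id

cdef int copy_cached(vector[FooC*]* out, FooC** cached):
    # Cache hit: walk the NULL-terminated array, appending each pointer.
    cdef size_t i = 0
    while cached[i] != NULL:
        out.push_back(cached[i])
        i += 1
    return 0

cdef FooC** snapshot(vector[FooC*]* out, size_t first_token):
    # Cache miss: copy everything appended since first_token into a
    # new array. calloc zeroes the extra final slot, which is the
    # terminator that copy_cached's loop stops on.
    cdef size_t n = out.size() - first_token
    cdef FooC** cached = <FooC**>calloc(n + 1, sizeof(FooC*))
    cdef size_t i
    for i in range(n):
        cached[i] = out[0][first_token + i]
    return cached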