Mirror of https://github.com/explosion/spaCy.git
* Pass only the tokens vector to _tokenize, instead of the whole Python object.

This commit is contained in:
parent 08cef75ffd
commit e68a431e5e
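For context, the pattern this commit applies can be shown in a minimal Cython sketch. The names here are hypothetical: the `Tokens` stand-in stores a vector of ints rather than `LexemeC*`, and `_fill`/`demo` are illustrative, not spaCy functions. The point is that the internal helper receives the raw C++ vector, so the hot loop never touches the Python wrapper object:

    # distutils: language = c++
    from libcpp.vector cimport vector

    cdef class Tokens:
        # Stand-in for spaCy's Tokens: it owns a heap-allocated C++
        # vector, exposed to internal code as a bare pointer.
        cdef vector[int] *v

        def __cinit__(self):
            self.v = new vector[int]()

        def __dealloc__(self):
            del self.v

    cdef int _fill(vector[int] *tokens_v, int value):
        # The helper sees only the vector: no Python-object argument,
        # no attribute lookups or reference counting in the loop body.
        tokens_v.push_back(value)
        return 0

    cpdef size_t demo():
        cdef Tokens tokens = Tokens()
        _fill(tokens.v, 42)   # pass tokens.v, not tokens, as in this commit
        return tokens.v.size()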
@@ -42,5 +42,5 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, String* string)
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
@@ -182,25 +182,25 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    self._tokenize(tokens, &span)
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
-            self._tokenize(tokens, &span)
+            self._tokenize(tokens.v, &span)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, String* string):
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string):
         cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
         cdef size_t i
         if lexemes != NULL:
             i = 0
             while lexemes[i] != NULL:
-                tokens.v.push_back(lexemes[i])
+                tokens_v.push_back(lexemes[i])
                 i += 1
             return 0
         cdef uint64_t key = string.key
-        cdef size_t first_token = len(tokens)
+        cdef size_t first_token = tokens_v.size()
         cdef int split
         cdef int remaining = string.n
         cdef String prefix
@@ -212,14 +212,14 @@ cdef class Language:
             if lexemes != NULL:
                 i = 0
                 while lexemes[i] != NULL:
-                    tokens.v.push_back(lexemes[i])
+                    tokens_v.push_back(lexemes[i])
                     i += 1
             else:
-                tokens.v.push_back(<LexemeC*>self.lexicon.get(&prefix))
-        lexemes = <LexemeC**>calloc(len(tokens) - first_token, sizeof(LexemeC*))
+                tokens_v.push_back(<LexemeC*>self.lexicon.get(&prefix))
+        lexemes = <LexemeC**>calloc(tokens_v.size() - first_token, sizeof(LexemeC*))
         cdef size_t j
-        for i, j in enumerate(range(first_token, tokens.v.size())):
-            lexemes[i] = tokens.v[0][j]
+        for i, j in enumerate(range(first_token, tokens_v.size())):
+            lexemes[i] = tokens_v[0][j]
         self.cache.set(key, lexemes)
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
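Both hunks above lean on the cache convention visible in the context lines: `cache.get` yields a NULL-terminated C array of `LexemeC*`, which a cache hit replays with a plain while loop. A hedged sketch of that convention follows; `pack`, `replay`, and the `int*` payloads are hypothetical, and the extra terminator slot is one way to keep the `lexemes[i] != NULL` loop safe, assumed here rather than shown in this diff:

    # distutils: language = c++
    from libc.stdlib cimport calloc
    from libcpp.vector cimport vector

    cdef int** pack(vector[int*] *items, size_t first) except NULL:
        # Copy the entries appended since `first` into a calloc'd array;
        # calloc zeroes the extra slot, which serves as the NULL terminator.
        cdef size_t n = items.size() - first
        cdef int** arr = <int**>calloc(n + 1, sizeof(int*))
        if arr == NULL:
            raise MemoryError()
        cdef size_t i
        for i in range(n):
            arr[i] = items[0][first + i]
        return arr

    cdef void replay(int** cached, vector[int*] *out):
        # Cache hit: push entries until the NULL terminator.
        cdef size_t i = 0
        while cached[i] != NULL:
            out.push_back(cached[i])
            i += 1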