mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00
* Switch to a dynamically allocated array, sized by the document length
This commit is contained in:
parent
0575f16ade
commit
0074ae2fc0
|
@ -1,6 +1,8 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from libc.stdlib cimport calloc, free
|
||||||
|
|
||||||
from ext.murmurhash cimport MurmurHash64A
|
from ext.murmurhash cimport MurmurHash64A
|
||||||
from ext.murmurhash cimport MurmurHash64B
|
from ext.murmurhash cimport MurmurHash64B
|
||||||
|
|
||||||
|
@ -60,9 +62,7 @@ cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
|
||||||
cdef Py_UNICODE c
|
cdef Py_UNICODE c
|
||||||
|
|
||||||
cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
|
cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
|
||||||
cdef Py_UNICODE[1000] current
|
cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
|
||||||
for i in range(1000):
|
|
||||||
current[i] = 0
|
|
||||||
cdef size_t word_len = 0
|
cdef size_t word_len = 0
|
||||||
cdef Lexeme* token
|
cdef Lexeme* token
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
|
@ -84,6 +84,7 @@ cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
|
||||||
while token != NULL:
|
while token != NULL:
|
||||||
tokens.push_back(<Lexeme_addr>token)
|
tokens.push_back(<Lexeme_addr>token)
|
||||||
token = token.tail
|
token = token.tail
|
||||||
|
free(current)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
|
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user