mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
* Work on efficiency
This commit is contained in:
parent
6fb42c4919
commit
43743a5d63
|
@ -51,7 +51,7 @@ cdef class Language:
|
||||||
|
|
||||||
cpdef Tokens tokenize(self, unicode text)
|
cpdef Tokens tokenize(self, unicode text)
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
|
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
|
||||||
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
|
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
|
||||||
vector[LexemeC*] *suffixes) except NULL
|
vector[LexemeC*] *suffixes) except NULL
|
||||||
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
|
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
|
||||||
|
|
|
@ -70,35 +70,40 @@ cdef class Language:
|
||||||
cdef int start = 0
|
cdef int start = 0
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef Py_UNICODE* chars = string
|
cdef Py_UNICODE* chars = string
|
||||||
|
cdef String span
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
if Py_UNICODE_ISSPACE(chars[i]) == 1:
|
if Py_UNICODE_ISSPACE(chars[i]) == 1:
|
||||||
if start < i:
|
if start < i:
|
||||||
self._tokenize(tokens, chars, start, i)
|
string_slice(&span, chars, start, i)
|
||||||
|
lexemes = <LexemeC**>self.cache.get(span.key)
|
||||||
|
if lexemes != NULL:
|
||||||
|
tokens.extend(start, lexemes, 0)
|
||||||
|
else:
|
||||||
|
self._tokenize(tokens, &span, start, i)
|
||||||
start = i + 1
|
start = i + 1
|
||||||
i += 1
|
i += 1
|
||||||
if start < i:
|
if start < i:
|
||||||
self._tokenize(tokens, chars, start, i)
|
string_slice(&span, chars, start, i)
|
||||||
|
lexemes = <LexemeC**>self.cache.get(span.key)
|
||||||
|
if lexemes != NULL:
|
||||||
|
tokens.extend(start, lexemes, 0)
|
||||||
|
else:
|
||||||
|
self._tokenize(tokens, &span, start, i)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
|
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
|
||||||
cdef String span
|
|
||||||
cdef vector[LexemeC*] prefixes
|
cdef vector[LexemeC*] prefixes
|
||||||
cdef vector[LexemeC*] suffixes
|
cdef vector[LexemeC*] suffixes
|
||||||
cdef uint64_t orig_key
|
cdef uint64_t orig_key
|
||||||
cdef int orig_size
|
cdef int orig_size
|
||||||
string_slice(&span, chars, start, end)
|
orig_key = span.key
|
||||||
lexemes = <LexemeC**>self.cache.get(span.key)
|
orig_size = tokens.lex.size()
|
||||||
if lexemes != NULL:
|
self._split_affixes(span, &prefixes, &suffixes)
|
||||||
tokens.extend(start, lexemes, 0)
|
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||||
else:
|
self._save_cached(tokens.lex, orig_key, orig_size)
|
||||||
orig_key = span.key
|
|
||||||
orig_size = tokens.lex.size()
|
|
||||||
span = self._split_affixes(&span, &prefixes, &suffixes)[0]
|
|
||||||
self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
|
|
||||||
self._save_cached(&tokens.lex, orig_key, orig_size)
|
|
||||||
|
|
||||||
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
|
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
|
||||||
vector[LexemeC*] *suffixes) except NULL:
|
vector[LexemeC*] *suffixes) except NULL:
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
cdef String prefix
|
cdef String prefix
|
||||||
cdef String suffix
|
cdef String suffix
|
||||||
|
@ -113,7 +118,7 @@ cdef class Language:
|
||||||
string_slice(&minus_pre, string.chars, pre_len, string.n)
|
string_slice(&minus_pre, string.chars, pre_len, string.n)
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
|
if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
|
||||||
string = &minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(&prefix))
|
||||||
break
|
break
|
||||||
suf_len = self._find_suffix(string.chars, string.n)
|
suf_len = self._find_suffix(string.chars, string.n)
|
||||||
|
@ -122,7 +127,7 @@ cdef class Language:
|
||||||
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
|
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
|
if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
|
||||||
string = &minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(&suffix))
|
||||||
break
|
break
|
||||||
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
||||||
|
@ -130,10 +135,10 @@ cdef class Language:
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(&prefix))
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(&suffix))
|
||||||
elif pre_len:
|
elif pre_len:
|
||||||
string = &minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(&prefix))
|
||||||
elif suf_len:
|
elif suf_len:
|
||||||
string = &minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(&suffix))
|
||||||
if self.specials.get(string.key):
|
if self.specials.get(string.key):
|
||||||
break
|
break
|
||||||
|
@ -271,7 +276,7 @@ cdef void string_from_unicode(String* s, unicode uni):
|
||||||
string_slice(s, c_uni, 0, len(uni))
|
string_slice(s, c_uni, 0, len(uni))
|
||||||
|
|
||||||
|
|
||||||
cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
|
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||||
s.chars = &chars[start]
|
s.chars = &chars[start]
|
||||||
s.n = end - start
|
s.n = end - start
|
||||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||||
|
|
|
@ -3,9 +3,9 @@ from libcpp.vector cimport vector
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef vector[LexemeC*] lex
|
cdef vector[LexemeC*] *lex
|
||||||
cdef vector[int] idx
|
cdef vector[int] *idx
|
||||||
cdef vector[int] pos
|
cdef vector[int] *pos
|
||||||
|
|
||||||
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
|
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
|
||||||
cdef int push_back(self, int i, LexemeC* lexeme) except -1
|
cdef int push_back(self, int i, LexemeC* lexeme) except -1
|
||||||
|
|
|
@ -25,10 +25,18 @@ cdef class Tokens:
|
||||||
"""
|
"""
|
||||||
def __cinit__(self, string_length=0):
|
def __cinit__(self, string_length=0):
|
||||||
size = int(string_length / 3) if string_length >= 3 else 1
|
size = int(string_length / 3) if string_length >= 3 else 1
|
||||||
|
self.lex = new vector[LexemeC*]()
|
||||||
|
self.idx = new vector[int]()
|
||||||
|
self.pos = new vector[int]()
|
||||||
self.lex.reserve(size)
|
self.lex.reserve(size)
|
||||||
self.idx.reserve(size)
|
self.idx.reserve(size)
|
||||||
self.pos.reserve(size)
|
self.pos.reserve(size)
|
||||||
|
|
||||||
|
def __dealloc__(self):
|
||||||
|
del self.lex
|
||||||
|
del self.idx
|
||||||
|
del self.pos
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
return Lexeme(<size_t>self.lex.at(i))
|
return Lexeme(<size_t>self.lex.at(i))
|
||||||
|
|
||||||
|
@ -38,7 +46,6 @@ cdef class Tokens:
|
||||||
cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
|
cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
|
||||||
self.lex.push_back(lexeme)
|
self.lex.push_back(lexeme)
|
||||||
self.idx.push_back(idx)
|
self.idx.push_back(idx)
|
||||||
self.pos.push_back(0)
|
|
||||||
return idx + lexeme.ints[<int>LexInt_length]
|
return idx + lexeme.ints[<int>LexInt_length]
|
||||||
|
|
||||||
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
|
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
|
||||||
|
@ -48,11 +55,15 @@ cdef class Tokens:
|
||||||
elif n == 0:
|
elif n == 0:
|
||||||
i = 0
|
i = 0
|
||||||
while lexemes[i] != NULL:
|
while lexemes[i] != NULL:
|
||||||
idx = self.push_back(idx, lexemes[i])
|
self.lex.push_back(lexemes[i])
|
||||||
|
self.idx.push_back(idx)
|
||||||
|
idx += lexemes[i].ints[<int>LexInt_length]
|
||||||
i += 1
|
i += 1
|
||||||
else:
|
else:
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
idx = self.push_back(idx, lexemes[i])
|
self.lex.push_back(lexemes[i])
|
||||||
|
self.idx.push_back(idx)
|
||||||
|
idx += lexemes[i].ints[<int>LexInt_length]
|
||||||
return idx
|
return idx
|
||||||
|
|
||||||
cpdef int id(self, size_t i) except -1:
|
cpdef int id(self, size_t i) except -1:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user