* Work on efficiency

This commit is contained in:
Matthew Honnibal 2014-10-14 18:22:41 +11:00
parent 6fb42c4919
commit 43743a5d63
4 changed files with 43 additions and 27 deletions

View File

@ -51,7 +51,7 @@ cdef class Language:
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1 cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes, cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,

View File

@ -70,35 +70,40 @@ cdef class Language:
cdef int start = 0 cdef int start = 0
cdef int i = 0 cdef int i = 0
cdef Py_UNICODE* chars = string cdef Py_UNICODE* chars = string
cdef String span
for i in range(length): for i in range(length):
if Py_UNICODE_ISSPACE(chars[i]) == 1: if Py_UNICODE_ISSPACE(chars[i]) == 1:
if start < i: if start < i:
self._tokenize(tokens, chars, start, i) string_slice(&span, chars, start, i)
lexemes = <LexemeC**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
self._tokenize(tokens, &span, start, i)
start = i + 1 start = i + 1
i += 1 i += 1
if start < i: if start < i:
self._tokenize(tokens, chars, start, i) string_slice(&span, chars, start, i)
lexemes = <LexemeC**>self.cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
self._tokenize(tokens, &span, start, i)
return tokens return tokens
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1: cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef String span
cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes cdef vector[LexemeC*] suffixes
cdef uint64_t orig_key cdef uint64_t orig_key
cdef int orig_size cdef int orig_size
string_slice(&span, chars, start, end) orig_key = span.key
lexemes = <LexemeC**>self.cache.get(span.key) orig_size = tokens.lex.size()
if lexemes != NULL: self._split_affixes(span, &prefixes, &suffixes)
tokens.extend(start, lexemes, 0) self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
else: self._save_cached(tokens.lex, orig_key, orig_size)
orig_key = span.key
orig_size = tokens.lex.size()
span = self._split_affixes(&span, &prefixes, &suffixes)[0]
self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
self._save_cached(&tokens.lex, orig_key, orig_size)
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes, cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL: vector[LexemeC*] *suffixes) except NULL:
cdef size_t i cdef size_t i
cdef String prefix cdef String prefix
cdef String suffix cdef String suffix
@ -113,7 +118,7 @@ cdef class Language:
string_slice(&minus_pre, string.chars, pre_len, string.n) string_slice(&minus_pre, string.chars, pre_len, string.n)
# Check whether we've hit a special-case # Check whether we've hit a special-case
if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL: if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
string = &minus_pre string[0] = minus_pre
prefixes.push_back(self.lexicon.get(&prefix)) prefixes.push_back(self.lexicon.get(&prefix))
break break
suf_len = self._find_suffix(string.chars, string.n) suf_len = self._find_suffix(string.chars, string.n)
@ -122,7 +127,7 @@ cdef class Language:
string_slice(&minus_suf, string.chars, 0, string.n - suf_len) string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
# Check whether we've hit a special-case # Check whether we've hit a special-case
if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL: if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
string = &minus_suf string[0] = minus_suf
suffixes.push_back(self.lexicon.get(&suffix)) suffixes.push_back(self.lexicon.get(&suffix))
break break
if pre_len and suf_len and (pre_len + suf_len) <= string.n: if pre_len and suf_len and (pre_len + suf_len) <= string.n:
@ -130,10 +135,10 @@ cdef class Language:
prefixes.push_back(self.lexicon.get(&prefix)) prefixes.push_back(self.lexicon.get(&prefix))
suffixes.push_back(self.lexicon.get(&suffix)) suffixes.push_back(self.lexicon.get(&suffix))
elif pre_len: elif pre_len:
string = &minus_pre string[0] = minus_pre
prefixes.push_back(self.lexicon.get(&prefix)) prefixes.push_back(self.lexicon.get(&prefix))
elif suf_len: elif suf_len:
string = &minus_suf string[0] = minus_suf
suffixes.push_back(self.lexicon.get(&suffix)) suffixes.push_back(self.lexicon.get(&suffix))
if self.specials.get(string.key): if self.specials.get(string.key):
break break
@ -271,7 +276,7 @@ cdef void string_from_unicode(String* s, unicode uni):
string_slice(s, c_uni, 0, len(uni)) string_slice(s, c_uni, 0, len(uni))
cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil: cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
s.chars = &chars[start] s.chars = &chars[start]
s.n = end - start s.n = end - start
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)

View File

@ -3,9 +3,9 @@ from libcpp.vector cimport vector
cdef class Tokens: cdef class Tokens:
cdef vector[LexemeC*] lex cdef vector[LexemeC*] *lex
cdef vector[int] idx cdef vector[int] *idx
cdef vector[int] pos cdef vector[int] *pos
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1 cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
cdef int push_back(self, int i, LexemeC* lexeme) except -1 cdef int push_back(self, int i, LexemeC* lexeme) except -1

View File

@ -25,10 +25,18 @@ cdef class Tokens:
""" """
def __cinit__(self, string_length=0): def __cinit__(self, string_length=0):
size = int(string_length / 3) if string_length >= 3 else 1 size = int(string_length / 3) if string_length >= 3 else 1
self.lex = new vector[LexemeC*]()
self.idx = new vector[int]()
self.pos = new vector[int]()
self.lex.reserve(size) self.lex.reserve(size)
self.idx.reserve(size) self.idx.reserve(size)
self.pos.reserve(size) self.pos.reserve(size)
def __dealloc__(self):
del self.lex
del self.idx
del self.pos
def __getitem__(self, i): def __getitem__(self, i):
return Lexeme(<size_t>self.lex.at(i)) return Lexeme(<size_t>self.lex.at(i))
@ -38,7 +46,6 @@ cdef class Tokens:
cdef int push_back(self, int idx, LexemeC* lexeme) except -1: cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
self.lex.push_back(lexeme) self.lex.push_back(lexeme)
self.idx.push_back(idx) self.idx.push_back(idx)
self.pos.push_back(0)
return idx + lexeme.ints[<int>LexInt_length] return idx + lexeme.ints[<int>LexInt_length]
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1: cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
@ -48,11 +55,15 @@ cdef class Tokens:
elif n == 0: elif n == 0:
i = 0 i = 0
while lexemes[i] != NULL: while lexemes[i] != NULL:
idx = self.push_back(idx, lexemes[i]) self.lex.push_back(lexemes[i])
self.idx.push_back(idx)
idx += lexemes[i].ints[<int>LexInt_length]
i += 1 i += 1
else: else:
for i in range(n): for i in range(n):
idx = self.push_back(idx, lexemes[i]) self.lex.push_back(lexemes[i])
self.idx.push_back(idx)
idx += lexemes[i].ints[<int>LexInt_length]
return idx return idx
cpdef int id(self, size_t i) except -1: cpdef int id(self, size_t i) except -1: