* More performance fiddling, particularly moving the specials into the cache, so that we can just look up the cache in _tokenize

This commit is contained in:
Matthew Honnibal 2014-09-13 00:59:34 +02:00
parent 7d239df4c8
commit afdc9b7ac2

View File

@ -110,11 +110,14 @@ cdef class Language:
return tokens return tokens
cdef int _tokenize(self, Tokens tokens, String* string): cdef int _tokenize(self, Tokens tokens, String* string):
cdef LexemeC** lexemes = <LexemeC**>self.specials[string.key] cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key]
if lexemes == NULL:
lexemes = <LexemeC**>self.cache[string.key] lexemes = <LexemeC**>self.cache[string.key]
cdef size_t i
if lexemes != NULL: if lexemes != NULL:
_extend_tokens(tokens, lexemes) i = 0
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
return 0 return 0
cdef uint64_t hashed = string.key cdef uint64_t hashed = string.key
@ -128,11 +131,13 @@ cdef class Language:
string_slice_prefix(string, &prefix, split) string_slice_prefix(string, &prefix, split)
lexemes = <LexemeC**>self.specials[prefix.key] lexemes = <LexemeC**>self.specials[prefix.key]
if lexemes != NULL: if lexemes != NULL:
_extend_tokens(tokens, lexemes) i = 0
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
else: else:
tokens.push_back(<LexemeC*>self.lexicon.get(&prefix)) tokens.push_back(<LexemeC*>self.lexicon.get(&prefix))
lexemes = <LexemeC**>calloc(tokens.length - first_token, sizeof(LexemeC*)) lexemes = <LexemeC**>calloc(tokens.length - first_token, sizeof(LexemeC*))
cdef size_t i
cdef size_t j cdef size_t j
for i, j in enumerate(range(first_token, tokens.length)): for i, j in enumerate(range(first_token, tokens.length)):
lexemes[i] = tokens.lexemes[j] lexemes[i] = tokens.lexemes[j]
@ -164,13 +169,7 @@ cdef class Language:
lexemes[i + 1] = NULL lexemes[i + 1] = NULL
string_from_unicode(&string, uni_string) string_from_unicode(&string, uni_string)
self.specials[string.key] = <size_t>lexemes self.specials[string.key] = <size_t>lexemes
self.cache[string.key] = <size_t>lexemes
cdef void _extend_tokens(Tokens tokens, LexemeC** lexemes):
cdef size_t i = 0
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
cdef class Lexicon: cdef class Lexicon: