* Moving to storing LexemeC structs internally

This commit is contained in:
Matthew Honnibal 2014-09-11 21:44:58 +02:00
parent 563047e90f
commit bf9c60c31c

View File

@@ -95,20 +95,21 @@ cdef class Language:
return tokens return tokens
cdef _tokenize(self, Tokens tokens, unicode string): cdef _tokenize(self, Tokens tokens, unicode string):
cdef list lexemes cdef LexemeC** lexemes
if len(string) == 1: if string in self.cache:
lexemes = [self.lookup(string)] lexemes = <LexemeC**><size_t>self.cache[string]
elif string in self.cache:
lexemes = self.cache[string]
else: else:
lexemes = []
substrings = self._split(string) substrings = self._split(string)
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
for i, substring in enumerate(substrings): for i, substring in enumerate(substrings):
lexemes.append(self.lexicon.lookup(substring)) lexemes[i] = self.lexicon.lookup(substring)._c
self.cache[string] = lexemes lexemes[i + 1] = NULL
cdef Lexeme lexeme self.cache[string] = <size_t>lexemes
for lexeme in lexemes: cdef LexemeC* lexeme
tokens.append(lexeme) i = 0
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
cdef list _split(self, unicode string): cdef list _split(self, unicode string):
"""Find how to split a contiguous span of non-space characters into substrings. """Find how to split a contiguous span of non-space characters into substrings.
@@ -147,11 +148,13 @@ cdef class Language:
token_rules (list): A list of (chunk, tokens) pairs, where chunk is token_rules (list): A list of (chunk, tokens) pairs, where chunk is
a string and tokens is a list of strings. a string and tokens is a list of strings.
''' '''
cdef LexemeC** lexemes
for string, substrings in token_rules: for string, substrings in token_rules:
lexemes = [] lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
for i, substring in enumerate(substrings): for i, substring in enumerate(substrings):
lexemes.append(self.lexicon.lookup(substring)) lexemes[i] = self.lexicon.lookup(substring)._c
self.cache[string] = lexemes lexemes[i + 1] = NULL
self.cache[string] = <size_t>lexemes
cdef class Lexicon: cdef class Lexicon: