* Working version that uses arrays for chunks, which should be more memory efficient

This commit is contained in:
Matthew Honnibal 2014-08-18 20:23:54 +02:00
parent 8d3f6082be
commit bbf9a2c944
2 changed files with 15 additions and 21 deletions

View File

@ -33,7 +33,7 @@ cdef class Language:
cdef Tokens tokenize(self, unicode text) cdef Tokens tokenize(self, unicode text)
cdef Lexeme* lookup(self, unicode string) except NULL cdef Lexeme* lookup(self, unicode string) except NULL
cdef Lexeme** lookup_chunk(self, char* chunk, size_t length) except NULL cdef Lexeme** lookup_chunk(self, Py_UNICODE* chunk, size_t length) except NULL
cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
cdef Lexeme* new_lexeme(self, unicode lex) except NULL cdef Lexeme* new_lexeme(self, unicode lex) except NULL

View File

@ -73,10 +73,9 @@ cdef class Language:
cdef Tokens tokenize(self, unicode string): cdef Tokens tokenize(self, unicode string):
cdef Lexeme** chunk cdef Lexeme** chunk
cdef Tokens tokens = Tokens(self) cdef Tokens tokens = Tokens(self)
cdef bytes byte_string = string.encode('utf8') cdef size_t length = len(string)
cdef size_t length = len(byte_string) cdef Py_UNICODE* characters = <Py_UNICODE*>string
cdef char* characters = <char*>byte_string cdef Py_UNICODE c
cdef char c
cdef size_t start = 0 cdef size_t start = 0
cdef size_t i cdef size_t i
for i in range(length): for i in range(length):
@ -86,7 +85,6 @@ cdef class Language:
chunk = self.lookup_chunk(&characters[start], i - start) chunk = self.lookup_chunk(&characters[start], i - start)
_extend(tokens, chunk) _extend(tokens, chunk)
start = i + 1 start = i + 1
i += 1
if start < i: if start < i:
chunk = self.lookup_chunk(&characters[start], length - start) chunk = self.lookup_chunk(&characters[start], length - start)
_extend(tokens, chunk) _extend(tokens, chunk)
@ -100,14 +98,12 @@ cdef class Language:
word = self.new_lexeme(string) word = self.new_lexeme(string)
return word return word
cdef Lexeme** lookup_chunk(self, Py_UNICODE* c_string, size_t length) except NULL:
    """Return the Lexeme* array cached for a chunk, building it on a miss.

    The first `length` Py_UNICODE units starting at `c_string` are hashed
    with mrmr.hash32 (seed 0) and the hash is used as the key into
    self.chunks. When the stored entry is NULL, the chunk is split with
    find_substrings, built with new_chunk, and the resulting pointer is
    stored back into self.chunks as a size_t.

    NOTE(review): the cache is keyed by the hash value alone; the chunk
    text itself is never compared, so colliding chunks would share an
    entry -- confirm this is acceptable.
    """
    cdef StringHash h = mrmr.hash32(c_string, length * sizeof(Py_UNICODE), 0)
    cdef Lexeme** chunk = <Lexeme**>self.chunks[h]
    cdef unicode ustring
    if chunk == NULL:
        # Materialise the Python string once; it is needed both for the
        # substring split and for building the new chunk.
        ustring = c_string[:length]
        chunk = self.new_chunk(ustring, self.find_substrings(ustring))
        self.chunks[h] = <size_t>chunk
    return chunk
@ -177,16 +173,14 @@ cdef class Language:
def load_tokenization(self, token_rules=None):
    """Pre-populate the chunk cache from explicit tokenization rules.

    token_rules is iterated as (chunk, tokens) pairs. Each chunk is
    hashed the same way lookup_chunk hashes its input (mrmr.hash32 over
    len(chunk) * sizeof(Py_UNICODE) bytes, seed 0), and the array built
    by new_chunk(chunk, tokens) is stored in self.chunks under that hash.

    Passing None (the default) loads nothing.
    """
    cdef StringHash h
    cdef Py_UNICODE* c_string
    cdef size_t length
    if token_rules is None:
        # The declared default is not iterable; treat it as "no rules".
        return
    for chunk, tokens in token_rules:
        length = len(chunk)
        # Borrowed pointer into `chunk`, which stays alive for this iteration.
        c_string = <Py_UNICODE*>chunk
        h = mrmr.hash32(c_string, length * sizeof(Py_UNICODE), 0)
        self.chunks[h] = <size_t>self.new_chunk(chunk, tokens)
def load_clusters(self): def load_clusters(self):
cdef Lexeme* w cdef Lexeme* w
data_dir = path.join(path.dirname(__file__), '..', 'data', 'en') data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
@ -204,12 +198,12 @@ cdef class Language:
self.new_lexeme(token_string) self.new_lexeme(token_string)
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
    # True only for the three separators recognised here:
    # space, newline and tab. Every other character yields False.
    return c == ' ' or c == '\n' or c == '\t'