* Tweak signatures and refactor slightly. Processing Gigaword takes 8-9 minutes. Tests passing, but there's some sort of memory bug on exit.

commit e096f30161
parent 073ee0de63
Author: Matthew Honnibal
Date:   2014-09-12 02:43:36 +02:00

2 changed files with 16 additions and 19 deletions

View File

@@ -73,6 +73,6 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
-    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
     cdef list _split(self, unicode string)
     cdef int _split_one(self, unicode word)
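The signature change in this `.pxd` hunk is the key part: `_tokenize` now returns a C `int` with an `except -1` clause. A plain `cdef` function returning a C type has no way to report a Python exception to its caller, so an error raised inside it would only be printed and then swallowed; declaring `except -1` makes a return value of -1 mean "an exception is pending", which Cython-generated callers check for and propagate. A minimal sketch of the mechanism (hypothetical function, not from this commit):

    cdef int fill_zeros(int* buf, size_t n) except -1:
        # Raising here is safe: Cython returns -1 behind the scenes, and the
        # caller sees the pending MemoryError instead of a silently printed one.
        if buf == NULL:
            raise MemoryError("buffer not allocated")
        cdef size_t i
        for i in range(n):
            buf[i] = 0
        return 0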

View File

@@ -100,32 +100,29 @@ cdef class Language:
             self._tokenize(tokens, &characters[start], i - start)
         return tokens

-    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
+    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef unicode string
-        cdef bint free_chunk = False
         cdef size_t i = 0
         cdef LexemeC** lexemes = <LexemeC**>self.cache[hashed]
         if lexemes is not NULL:
             while lexemes[i] != NULL:
                 tokens.push_back(lexemes[i])
                 i += 1
+            return 0
+        cdef unicode string = characters[:length]
+        cdef list substrings = self._split(string)
+        lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
+        cdef unicode substring
+        for i, substring in enumerate(substrings):
+            lexemes[i] = <LexemeC*>self.lexicon.get(substring)
+            tokens.push_back(lexemes[i])
+        lexemes[i + 1] = NULL
+        if self.cache_size < 1000000:
+            self.cache[hashed] = <size_t>lexemes
+            self.cache_size += 1
         else:
-            string = characters[:length]
-            substrings = self._split(string)
-            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
-            for i, substring in enumerate(substrings):
-                lexemes[i] = <LexemeC*>self.lexicon.get(substring)
-                tokens.push_back(lexemes[i])
-            lexemes[i + 1] = NULL
-            # The intuition here is that if an element belongs in the cache, it
-            # has several chances to get in. And if the cache is large, we less
-            # believe that the element belongs there.
-            if self.cache_size == 0 or random.random() < (100000.0 / self.cache_size):
-                self.cache[hashed] = <size_t>lexemes
-                self.cache_size += 1
-            else:
-                free(lexemes)
+            free(lexemes)

     cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.