* Improve the cache mechanism: a newly tokenized string is now admitted to the cache at random, with a probability that shrinks as the cache grows.

This commit is contained in:
Matthew Honnibal 2014-09-12 00:18:31 +02:00
parent c8f7c8bfde
commit 2389bd1b10
2 changed files with 20 additions and 9 deletions

View File

@ -11,6 +11,7 @@ from __future__ import unicode_literals
from libc.stdlib cimport calloc, free
import json
import random
from os import path
from .util import read_lang_data
@ -85,7 +86,7 @@ cdef class Language:
cdef size_t start = 0
cdef size_t i = 0
for c in string:
if c == ' ':
if c == ' ' or c == '\n' or c == '\t':
if start < i:
self._tokenize(tokens, string[start:i])
start = i + 1
@ -96,20 +97,27 @@ cdef class Language:
cdef _tokenize(self, Tokens tokens, unicode string):
# Append the lexemes for `string` to `tokens`, looking in self.cache first
# and falling back to self._split + self.lexicon.get on a miss.
#
# NOTE(review): this span looks like a diff hunk rendered without +/- markers
# or indentation, so lines from the old and new versions of the method appear
# to be interleaved (the drain loop and the NULL-terminator store each occur
# twice). Confirm against the real file before changing any code here.
cdef LexemeC** lexemes
# free_chunk is declared but not referenced in the lines visible here.
cdef bint free_chunk = False
cdef size_t i = 0
if string in self.cache:
# Cache values are stored as size_t integers; cast back to the pointer array.
lexemes = <LexemeC**><size_t>self.cache[string]
# The array is NULL-terminated: push entries until the sentinel is reached.
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
else:
substrings = self._split(string)
# One slot per substring plus one extra slot for the NULL terminator.
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
for i, substring in enumerate(substrings):
lexemes[i] = <LexemeC*>self.lexicon.get(substring)
lexemes[i + 1] = NULL
self.cache[string] = <size_t>lexemes
# `lexeme` is declared but not referenced in the lines visible here.
cdef LexemeC* lexeme
i = 0
while lexemes[i] != NULL:
tokens.push_back(lexemes[i])
i += 1
# NOTE(review): if this store runs after the loop and self._split returned an
# empty list, i is still 0 and index 1 lies outside the 1-slot allocation.
# Verify that _split can never return an empty list, or guard this store.
lexemes[i + 1] = NULL
# The intuition here is that an element which belongs in the cache is seen
# repeatedly, so it gets several chances to be admitted. The larger the
# cache already is, the less we believe a new element belongs in it.
# An empty cache always admits. Otherwise the entry is stored when
# random.random() falls below 100000.0 / len(self.cache), a ratio that is
# >= 1 until the cache holds more than 100000 entries.
if not self.cache or random.random() < (100000.0 / len(self.cache)):
self.cache[string] = <size_t>lexemes
else:
# Not cached: release the pointer array. Only the array is freed here, not
# the LexemeC objects it points to (presumably owned by self.lexicon — confirm).
free(lexemes)
cdef list _split(self, unicode string):
"""Find how to split a contiguous span of non-space characters into substrings.

View File

@ -24,12 +24,15 @@ cdef class Tokens:
>>> tokens.can_noun(1)
True
"""
# NOTE(review): two __cinit__ signatures appear back to back; presumably the
# first pair of lines is the removed version and the rest is its replacement
# (diff markers are not visible here) — confirm against the real file.
def __cinit__(self, size=100):
assert size >= 1
def __cinit__(self, string_length=0):
# Allocate the lexeme-pointer array and reset the bookkeeping fields.
# Initial capacity is one slot per three characters of input, with a
# minimum of one slot when string_length is below 3.
size = int(string_length / 3) if string_length >= 3 else 1
# NOTE(review): the calloc result is not checked for NULL in the lines visible here.
self.lexemes = <LexemeC**>calloc(size, sizeof(LexemeC*))
# size records the allocated capacity; length starts at 0.
self.size = size
self.length = 0
def __dealloc__(self):
# Release the pointer array allocated with calloc in __cinit__. Only the
# array itself is freed here, not the LexemeC objects it points to.
free(self.lexemes)
def __getitem__(self, i):
if i >= self.length:
raise IndexError