mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Improve cache mechanism by including a random element depending on the size of the cache.
This commit is contained in:
parent
c8f7c8bfde
commit
2389bd1b10
|
@ -11,6 +11,7 @@ from __future__ import unicode_literals
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import random
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
from .util import read_lang_data
|
from .util import read_lang_data
|
||||||
|
@ -85,7 +86,7 @@ cdef class Language:
|
||||||
cdef size_t start = 0
|
cdef size_t start = 0
|
||||||
cdef size_t i = 0
|
cdef size_t i = 0
|
||||||
for c in string:
|
for c in string:
|
||||||
if c == ' ':
|
if c == ' ' or c == '\n' or c == '\t':
|
||||||
if start < i:
|
if start < i:
|
||||||
self._tokenize(tokens, string[start:i])
|
self._tokenize(tokens, string[start:i])
|
||||||
start = i + 1
|
start = i + 1
|
||||||
|
@ -96,20 +97,27 @@ cdef class Language:
|
||||||
|
|
||||||
cdef _tokenize(self, Tokens tokens, unicode string):
|
cdef _tokenize(self, Tokens tokens, unicode string):
|
||||||
cdef LexemeC** lexemes
|
cdef LexemeC** lexemes
|
||||||
|
cdef bint free_chunk = False
|
||||||
|
cdef size_t i = 0
|
||||||
if string in self.cache:
|
if string in self.cache:
|
||||||
lexemes = <LexemeC**><size_t>self.cache[string]
|
lexemes = <LexemeC**><size_t>self.cache[string]
|
||||||
|
while lexemes[i] != NULL:
|
||||||
|
tokens.push_back(lexemes[i])
|
||||||
|
i += 1
|
||||||
else:
|
else:
|
||||||
substrings = self._split(string)
|
substrings = self._split(string)
|
||||||
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
lexemes[i] = <LexemeC*>self.lexicon.get(substring)
|
lexemes[i] = <LexemeC*>self.lexicon.get(substring)
|
||||||
|
tokens.push_back(lexemes[i])
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
self.cache[string] = <size_t>lexemes
|
# The intuition here is that if an element belongs in the cache, it
|
||||||
cdef LexemeC* lexeme
|
# has several chances to get in. And the larger the cache is, the less
|
||||||
i = 0
|
# we believe that the element belongs there.
|
||||||
while lexemes[i] != NULL:
|
if not self.cache or random.random() < (100000.0 / len(self.cache)):
|
||||||
tokens.push_back(lexemes[i])
|
self.cache[string] = <size_t>lexemes
|
||||||
i += 1
|
else:
|
||||||
|
free(lexemes)
|
||||||
|
|
||||||
cdef list _split(self, unicode string):
|
cdef list _split(self, unicode string):
|
||||||
"""Find how to split a contiguous span of non-space characters into substrings.
|
"""Find how to split a contiguous span of non-space characters into substrings.
|
||||||
|
|
|
@ -24,12 +24,15 @@ cdef class Tokens:
|
||||||
>>> tokens.can_noun(1)
|
>>> tokens.can_noun(1)
|
||||||
True
|
True
|
||||||
"""
|
"""
|
||||||
def __cinit__(self, size=100):
|
def __cinit__(self, string_length=0):
|
||||||
assert size >= 1
|
size = int(string_length / 3) if string_length >= 3 else 1
|
||||||
self.lexemes = <LexemeC**>calloc(size, sizeof(LexemeC*))
|
self.lexemes = <LexemeC**>calloc(size, sizeof(LexemeC*))
|
||||||
self.size = size
|
self.size = size
|
||||||
self.length = 0
|
self.length = 0
|
||||||
|
|
||||||
|
def __dealloc__(self):
|
||||||
|
free(self.lexemes)
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
if i >= self.length:
|
if i >= self.length:
|
||||||
raise IndexError
|
raise IndexError
|
||||||
|
|
Loading…
Reference in New Issue
Block a user