* Switch to 64-bit hashes, for better reliability

This commit is contained in:
Matthew Honnibal 2014-09-12 02:04:47 +02:00
parent 2389bd1b10
commit 3c928fb5e0

View File

@ -17,6 +17,7 @@ from os import path
from .util import read_lang_data from .util import read_lang_data
from spacy.tokens import Tokens from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, lexeme_init from spacy.lexeme cimport LexemeC, lexeme_init
from murmurhash.mrmr cimport hash64
cdef class Language: cdef class Language:
@ -85,26 +86,32 @@ cdef class Language:
cdef size_t start = 0 cdef size_t start = 0
cdef size_t i = 0 cdef size_t i = 0
for c in string: cdef Py_UNICODE* characters = string
cdef Py_UNICODE c
for i in range(length):
c = characters[i]
if c == ' ' or c == '\n' or c == '\t': if c == ' ' or c == '\n' or c == '\t':
if start < i: if start < i:
self._tokenize(tokens, string[start:i]) self._tokenize(tokens, &characters[start], i - start)
start = i + 1 start = i + 1
i += 1 i += 1
if start < i: if start < i:
self._tokenize(tokens, string[start:i]) self._tokenize(tokens, &characters[start], i - start)
return tokens return tokens
cdef _tokenize(self, Tokens tokens, unicode string): cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
cdef unicode string
cdef LexemeC** lexemes cdef LexemeC** lexemes
cdef bint free_chunk = False cdef bint free_chunk = False
cdef size_t i = 0 cdef size_t i = 0
if string in self.cache: if hashed in self.cache:
lexemes = <LexemeC**><size_t>self.cache[string] lexemes = <LexemeC**><size_t>self.cache[hashed]
while lexemes[i] != NULL: while lexemes[i] != NULL:
tokens.push_back(lexemes[i]) tokens.push_back(lexemes[i])
i += 1 i += 1
else: else:
string = characters[:length]
substrings = self._split(string) substrings = self._split(string)
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*)) lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
for i, substring in enumerate(substrings): for i, substring in enumerate(substrings):
@ -115,7 +122,7 @@ cdef class Language:
# has several chances to get in. And if the cache is large, we are less # has several chances to get in. And if the cache is large, we are less
# inclined to believe that the element belongs there. # inclined to believe that the element belongs there.
if not self.cache or random.random() < (100000.0 / len(self.cache)): if not self.cache or random.random() < (100000.0 / len(self.cache)):
self.cache[string] = <size_t>lexemes self.cache[hashed] = <size_t>lexemes
else: else:
free(lexemes) free(lexemes)
@ -157,12 +164,14 @@ cdef class Language:
a string and tokens is a list of strings. a string and tokens is a list of strings.
''' '''
cdef LexemeC** lexemes cdef LexemeC** lexemes
cdef uint64_t hashed
for string, substrings in token_rules: for string, substrings in token_rules:
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*)) lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
for i, substring in enumerate(substrings): for i, substring in enumerate(substrings):
lexemes[i] = <LexemeC*>self.lexicon.get(substring) lexemes[i] = <LexemeC*>self.lexicon.get(substring)
lexemes[i + 1] = NULL lexemes[i + 1] = NULL
self.cache[string] = <size_t>lexemes hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
self.cache[hashed] = <size_t>lexemes
cdef class Lexicon: cdef class Lexicon: