Use 32-bit hashes for OOV strings (re Issue #589, Issue #285)

This commit is contained in:
Matthew Honnibal 2016-11-01 13:27:13 +01:00
parent 6977a2b8cd
commit 62fc6b1afa

View File

@ -5,11 +5,12 @@ cimport cython
from libc.string cimport memcpy from libc.string cimport memcpy
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t from preshed.maps cimport map_iter, key_t
from .typedefs cimport hash_t from .typedefs cimport hash_t
from libc.stdint cimport uint32_t
try: try:
import ujson as json import ujson as json
@ -26,6 +27,10 @@ cdef hash_t _hash_utf8(char* utf8_string, int length):
return hash64(utf8_string, length, 1) return hash64(utf8_string, length, 1)
# Return a 32-bit hash of the first `length` bytes of `utf8_string`.
# This is the 32-bit counterpart of `_hash_utf8` (which calls hash64); the
# StringStore uses the result both as the ID returned for an out-of-vocabulary
# string and as the key under which that string is kept in the OOV store.
cdef uint32_t _hash32_utf8(char* utf8_string, int length):
# NOTE(review): the trailing 1 is presumably the hash seed, matching the
# value passed to hash64 in `_hash_utf8` -- confirm against murmurhash.mrmr.
return hash32(utf8_string, length, 1)
cdef unicode _decode(const Utf8Str* string): cdef unicode _decode(const Utf8Str* string):
cdef int i, length cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0: if string.s[0] < sizeof(string.s) and string.s[0] != 0:
@ -84,7 +89,7 @@ cdef class StringStore:
self._resize_at = 10000 self._resize_at = 10000
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1 self.size = 1
self.is_frozen = False self.is_frozen = freeze
if strings is not None: if strings is not None:
for string in strings: for string in strings:
_ = self[string] _ = self[string]
@ -118,12 +123,14 @@ cdef class StringStore:
cdef bytes byte_string cdef bytes byte_string
cdef const Utf8Str* utf8str cdef const Utf8Str* utf8str
cdef uint64_t int_id cdef uint64_t int_id
cdef uint32_t oov_id
if isinstance(string_or_id, (int, long)): if isinstance(string_or_id, (int, long)):
int_id = string_or_id int_id = string_or_id
oov_id = string_or_id
if int_id < <uint64_t>self.size: if int_id < <uint64_t>self.size:
return _decode(&self.c[int_id]) return _decode(&self.c[int_id])
else: else:
utf8str = <Utf8Str*>self._oov.get(int_id) utf8str = <Utf8Str*>self._oov.get(oov_id)
if utf8str is not NULL: if utf8str is not NULL:
return _decode(utf8str) return _decode(utf8str)
else: else:
@ -137,10 +144,12 @@ cdef class StringStore:
raise TypeError(type(string_or_id)) raise TypeError(type(string_or_id))
utf8str = self._intern_utf8(byte_string, len(byte_string)) utf8str = self._intern_utf8(byte_string, len(byte_string))
if utf8str is NULL: if utf8str is NULL:
# TODO: We could get unlucky here, and hash into a value that # TODO: We need to use 32 bit here, for compatibility with the
# collides with the 'real' strings. All we have to do is offset # vocabulary values. This makes birthday paradox probabilities
# I think? # pretty bad.
return _hash_utf8(byte_string, len(byte_string)) # We could also get unlucky here, and hash into a value that
# collides with the 'real' strings.
return _hash32_utf8(byte_string, len(byte_string))
else: else:
return utf8str - self.c return utf8str - self.c
@ -199,11 +208,13 @@ cdef class StringStore:
if value is not NULL: if value is not NULL:
return value return value
if self.is_frozen: if self.is_frozen:
# OOV store uses 32 bit hashes. Pretty ugly :(
key32 = _hash32_utf8(utf8_string, length)
# Important: Make the OOV store own the memory. That way it's trivial # Important: Make the OOV store own the memory. That way it's trivial
# to flush them all. # to flush them all.
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str)) value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length) value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
self._oov.set(key, value) self._oov.set(key32, value)
return NULL return NULL
if self.size == self._resize_at: if self.size == self._resize_at: