mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good.
This commit is contained in:
parent
d3dc5718b2
commit
8423e8627f
|
@ -7,6 +7,8 @@ from libc.stdint cimport int64_t
|
|||
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
DEF UINT64_MAX = 18446744073709551615
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0
|
||||
|
||||
|
||||
|
@ -22,6 +24,10 @@ cdef class StringStore:
|
|||
|
||||
cdef public PreshMap _map
|
||||
cdef int64_t _resize_at
|
||||
cdef PreshMap oov_maps
|
||||
|
||||
cdef const Utf8Str* intern(self, unicode py_string) except NULL
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL
|
||||
cpdef int remove_oov_map(self, Pool mem) except -1
|
||||
|
||||
cdef hash_t intern(self, unicode py_string, Pool mem=*) except UINT64_MAX
|
||||
cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string,
|
||||
int length) except NULL
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals, absolute_import
|
||||
|
||||
cimport cython
|
||||
|
@ -6,7 +7,8 @@ from libc.stdint cimport uint64_t
|
|||
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from preshed.maps cimport map_iter, key_t
|
||||
from preshed.maps cimport map_init, map_set, map_get, map_iter
|
||||
from preshed.maps cimport MapStruct
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
|
@ -16,13 +18,17 @@ except ImportError:
|
|||
import json
|
||||
|
||||
|
||||
DEF UINT64_MAX = 18446744073709551615
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
chars = string.encode('utf8')
|
||||
return _hash_utf8(chars, len(chars))
|
||||
byte_string = string.encode('utf8')
|
||||
cdef unsigned char* chars = byte_string
|
||||
return _hash_utf8(chars, len(byte_string))
|
||||
|
||||
|
||||
cdef hash_t _hash_utf8(char* utf8_string, int length):
|
||||
return hash64(utf8_string, length, 1)
|
||||
cdef hash_t _hash_utf8(const unsigned char* utf8_string, int length) nogil:
|
||||
return hash64(<void*>utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef unicode _decode(const Utf8Str* string):
|
||||
|
@ -74,6 +80,7 @@ cdef class StringStore:
|
|||
def __init__(self, strings=None):
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self.oov_maps = PreshMap()
|
||||
self._resize_at = 10000
|
||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||
self.size = 1
|
||||
|
@ -108,14 +115,21 @@ cdef class StringStore:
|
|||
byte_string = <bytes>string_or_id
|
||||
if len(byte_string) == 0:
|
||||
return 0
|
||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
||||
return utf8str - self.c
|
||||
key = _hash_utf8(byte_string, len(byte_string))
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
raise KeyError(byte_string)
|
||||
else:
|
||||
return utf8str - self.c
|
||||
elif isinstance(string_or_id, unicode):
|
||||
if len(<unicode>string_or_id) == 0:
|
||||
return 0
|
||||
byte_string = (<unicode>string_or_id).encode('utf8')
|
||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
||||
return utf8str - self.c
|
||||
key = hash_string(string_or_id)
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
raise KeyError(string_or_id)
|
||||
else:
|
||||
return utf8str - self.c
|
||||
else:
|
||||
raise TypeError(type(string_or_id))
|
||||
|
||||
|
@ -131,6 +145,8 @@ cdef class StringStore:
|
|||
yield _decode(&self.c[i]) if i > 0 else u''
|
||||
|
||||
def __reduce__(self):
|
||||
# TODO: Is it problematic that we don't save the OOV strings?
|
||||
# Probably yes? We're not restoring all the state...
|
||||
strings = [""]
|
||||
for i in range(1, self.size):
|
||||
string = &self.c[i]
|
||||
|
@ -138,27 +154,77 @@ cdef class StringStore:
|
|||
strings.append(py_string)
|
||||
return (StringStore, (strings,), None, None, None)
|
||||
|
||||
cdef const Utf8Str* intern(self, unicode py_string) except NULL:
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef hash_t intern(self, unicode py_string, Pool mem=None) except UINT64_MAX:
|
||||
if mem is None:
|
||||
mem = self.mem
|
||||
cdef hash_t map_key = id(mem)
|
||||
cdef bytes byte_string = py_string.encode('utf8')
|
||||
return self._intern_utf8(byte_string, len(byte_string))
|
||||
cdef hash_t key = _hash_utf8(byte_string, len(byte_string))
|
||||
cdef const Utf8Str* utf8str = <Utf8Str*>self._map.get(key)
|
||||
cdef hash_t map_id = id(mem)
|
||||
cdef MapStruct* oov_map
|
||||
if utf8str is not NULL:
|
||||
return utf8str - self.c
|
||||
elif mem is None or mem is self.mem:
|
||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
||||
return utf8str - self.c
|
||||
else:
|
||||
new_utf8str = <Utf8Str*>mem.alloc(sizeof(Utf8Str), 1)
|
||||
oov_map = <MapStruct*>self.oov_maps.get(map_key)
|
||||
if oov_map is NULL:
|
||||
oov_map = <MapStruct*>mem.alloc(sizeof(MapStruct), 1)
|
||||
map_init(mem, oov_map, 16)
|
||||
self.oov_maps.set(id(mem), oov_map)
|
||||
new_utf8str[0] = _allocate(mem, byte_string, len(byte_string))
|
||||
map_set(mem, oov_map, key, new_utf8str)
|
||||
return key
|
||||
|
||||
def decode_int(self, hash_t int_, Pool mem=None):
    '''Decode an integer ID back into its unicode string.

    Lookup order:
      1. 0 always decodes to the empty string.
      2. Values below self.size are treated as indices into the main
         string table (self.c).
      3. Anything else is looked up in the auxiliary map registered in
         self.oov_maps under id(mem), using int_ as the key. For strings
         interned into an auxiliary pool, intern() returns the string's
         hash, and that hash is the key used here.

    Args:
        int_ (hash_t): The integer to decode.
        mem (Pool): The auxiliary pool the string may have been interned
            into. If None (or the store's own pool), only the main table
            is consulted.

    Returns:
        unicode: The decoded string.

    Raises:
        IndexError: If int_ is not in the main table and either no
            auxiliary pool was given, the pool has no registered map, or
            the map has no entry for int_.
    '''
    cdef hash_t map_key
    # Declared explicitly so the pointer types don't depend on infer_types.
    cdef MapStruct* oov_map
    cdef const Utf8Str* utf8str
    if int_ == 0:
        return u''
    elif int_ < <uint64_t>self.size:
        return _decode(&self.c[int_])
    elif mem is None or mem is self.mem:
        raise IndexError(int_)
    else:
        map_key = id(mem)
        oov_map = <MapStruct*>self.oov_maps.get(map_key)
        if oov_map is NULL:
            raise IndexError(
                "Trying to decode integer into string, but it's not in " +
                "the main store, and the memory pool hasn't been seen before.\n" +
                ("int_ == %d\n" % int_) +
                "id(mem) == %d" % map_key)
        else:
            utf8str = <const Utf8Str*>map_get(oov_map, int_)
            if utf8str is NULL:
                # The trailing "\n" keeps the diagnostic values on their own
                # lines, matching the message raised above.
                raise IndexError(
                    "Trying to decode integer into string, but it's not in " +
                    "the main store. The integer was also not found in the " +
                    "indicated auxiliary pool " +
                    "(which is usually specific to a document).\n" +
                    ("int_ == %d\n" % int_) +
                    "id(mem) == %d" % map_key)
            return _decode(utf8str)
|
||||
|
||||
@cython.final
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL:
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef hash_t key = _hash_utf8(utf8_string, length)
|
||||
value = <Utf8Str*>self._map.get(key)
|
||||
if value is not NULL:
|
||||
return value
|
||||
|
||||
cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string,
|
||||
int length) except NULL:
|
||||
if self.size == self._resize_at:
|
||||
self._realloc()
|
||||
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||
key = _hash_utf8(utf8_string, length)
|
||||
self.c[self.size] = _allocate(self.mem, utf8_string, length)
|
||||
self._map.set(key, <void*>&self.c[self.size])
|
||||
self.size += 1
|
||||
return &self.c[self.size-1]
|
||||
|
||||
cpdef int remove_oov_map(self, Pool mem) except -1:
    '''Drop the auxiliary string map registered for a memory pool.

    Auxiliary maps are stored in self.oov_maps keyed by id(mem), so the
    same key is used here to remove the entry.

    Args:
        mem (Pool): The pool whose auxiliary map should be forgotten.

    Returns:
        int: 0 on success (-1 is reserved for exception propagation).
    '''
    cdef hash_t key = id(mem)
    # The per-pool maps live in `oov_maps`; the class has no `_maps`
    # attribute, so the original `self._maps.pop(key)` could not resolve.
    # NOTE(review): relies on PreshMap exposing pop() -- confirm against the
    # pinned preshed version.
    self.oov_maps.pop(key)
    return 0
|
||||
|
||||
def dump(self, file_):
|
||||
# TODO: Is it problematic that we don't save the OOV strings? No, right?
|
||||
string_data = json.dumps(list(self))
|
||||
if not isinstance(string_data, unicode):
|
||||
string_data = string_data.decode('utf8')
|
||||
|
@ -180,8 +246,8 @@ cdef class StringStore:
|
|||
# we resize our array. So, first we remap to indices, then we resize,
|
||||
# then we can acquire the new pointers.
|
||||
cdef Pool tmp_mem = Pool()
|
||||
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
|
||||
cdef key_t key
|
||||
keys = <hash_t*>tmp_mem.alloc(self.size, sizeof(hash_t))
|
||||
cdef hash_t key
|
||||
cdef void* value
|
||||
cdef const Utf8Str ptr
|
||||
cdef int i = 0
|
||||
|
|
|
@ -116,11 +116,11 @@ cdef class Token:
|
|||
|
||||
property text_with_ws:
|
||||
def __get__(self):
|
||||
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
||||
orth_ = self.orth_
|
||||
if self.c.spacy:
|
||||
return orth + u' '
|
||||
return orth_ + u' '
|
||||
else:
|
||||
return orth
|
||||
return orth_
|
||||
|
||||
property prob:
|
||||
def __get__(self):
|
||||
|
@ -403,7 +403,7 @@ cdef class Token:
|
|||
|
||||
property ent_type_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_type]
|
||||
return self.vocab.strings.decode_int(self.c.ent_type, mem=self.mem)
|
||||
|
||||
property ent_iob_:
|
||||
def __get__(self):
|
||||
|
@ -424,7 +424,7 @@ cdef class Token:
|
|||
property ent_id_:
|
||||
'''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_id]
|
||||
return self.vocab.strings.decode_int(self.c.ent_id, mem=self.mem)
|
||||
|
||||
def __set__(self, hash_t key):
|
||||
# TODO
|
||||
|
@ -438,35 +438,35 @@ cdef class Token:
|
|||
|
||||
property orth_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.orth]
|
||||
return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem)
|
||||
|
||||
property lower_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.lower]
|
||||
return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem)
|
||||
|
||||
property norm_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.norm]
|
||||
return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem)
|
||||
|
||||
property shape_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.shape]
|
||||
return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem)
|
||||
|
||||
property prefix_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.prefix]
|
||||
return self.vocab.strings.decode_int(self.c.lex.prefix, mem=self.mem)
|
||||
|
||||
property suffix_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.suffix]
|
||||
return self.vocab.strings.decode_int(self.c.lex.suffix, mem=self.mem)
|
||||
|
||||
property lang_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.lang]
|
||||
return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem)
|
||||
|
||||
property lemma_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem)
|
||||
|
||||
property pos_:
|
||||
def __get__(self):
|
||||
|
@ -474,13 +474,13 @@ cdef class Token:
|
|||
|
||||
property tag_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.tag]
|
||||
return self.vocab.strings.decode_int(self.c.tag, mem=self.mem)
|
||||
|
||||
property dep_:
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.dep]
|
||||
return self.vocab.decode_int(self.c.dep, mem=self.mem)
|
||||
def __set__(self, unicode label):
|
||||
self.c.dep = self.vocab.strings[label]
|
||||
self.c.dep = self.vocab.strings.intern(label, mem=self.mem)
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
|
||||
from libc.stdint cimport uint8_t
|
||||
|
||||
from libc.stdint cimport UINT64_MAX as err_hash_t
|
||||
from libc.stdint cimport UINT64_MAX as err_flags_t
|
||||
from libc.stdint cimport UINT64_MAX as err_len_t
|
||||
from libc.stdint cimport UINT64_MAX as err_tag_t
|
||||
|
||||
ctypedef uint64_t hash_t
|
||||
ctypedef char* utf8_t
|
||||
|
|
|
@ -27,6 +27,7 @@ cdef struct _Cached:
|
|||
cdef class Vocab:
|
||||
cdef Pool mem
|
||||
cpdef readonly StringStore strings
|
||||
cpdef readonly dict oov_stores
|
||||
cpdef readonly Morphology morphology
|
||||
cdef readonly int length
|
||||
cdef public object _serializer
|
||||
|
|
|
@ -93,6 +93,7 @@ cdef class Vocab:
|
|||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
self.strings = StringStore()
|
||||
self.oov_stores = {}
|
||||
# Load strings in a special order, so that we have an onset number for
|
||||
# the vocabulary. This way, when words are added in order, the orth ID
|
||||
# is the frequency rank of the word, plus a certain offset. The structural
|
||||
|
@ -140,7 +141,7 @@ cdef class Vocab:
|
|||
lex = <LexemeC*>self._by_hash.get(key)
|
||||
cdef size_t addr
|
||||
if lex != NULL:
|
||||
if lex.orth != self.strings[string]:
|
||||
if (string not in self.strings) or (lex.orth != self.strings[string]):
|
||||
raise LookupError.mismatched_strings(
|
||||
lex.orth, self.strings[string], self.strings[lex.orth], string)
|
||||
return lex
|
||||
|
@ -163,10 +164,10 @@ cdef class Vocab:
|
|||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
||||
cdef hash_t key
|
||||
cdef bint is_oov = mem is not self.mem
|
||||
if len(string) < 3:
|
||||
if len(string) < 3 or not is_oov:
|
||||
mem = self.mem
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
lex.orth = self.strings[string]
|
||||
lex.orth = self.strings.intern(string, mem=mem)
|
||||
lex.length = len(string)
|
||||
lex.id = self.length
|
||||
lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
|
||||
|
@ -174,7 +175,7 @@ cdef class Vocab:
|
|||
for attr, func in self.lex_attr_getters.items():
|
||||
value = func(string)
|
||||
if isinstance(value, unicode):
|
||||
value = self.strings[value]
|
||||
value = self.strings.intern(value)
|
||||
if attr == PROB:
|
||||
lex.prob = value
|
||||
else:
|
||||
|
@ -205,7 +206,8 @@ cdef class Vocab:
|
|||
|
||||
def __getitem__(self, id_or_string):
|
||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new lexeme is created and stored.
|
||||
unseen unicode string is given, a new lexeme is created and stored, and
|
||||
the string is interned in the vocabulary.
|
||||
|
||||
Args:
|
||||
id_or_string (int or unicode):
|
||||
|
@ -220,7 +222,7 @@ cdef class Vocab:
|
|||
'''
|
||||
cdef attr_t orth
|
||||
if type(id_or_string) == unicode:
|
||||
orth = self.strings[id_or_string]
|
||||
orth = self.strings.intern(id_or_string)
|
||||
else:
|
||||
orth = id_or_string
|
||||
return Lexeme(self, orth)
|
||||
|
@ -236,7 +238,7 @@ cdef class Vocab:
|
|||
if 'pos' in props:
|
||||
self.morphology.assign_tag(token, props['pos'])
|
||||
if 'L' in props:
|
||||
tokens[i].lemma = self.strings[props['L']]
|
||||
tokens[i].lemma = self.strings.intern(props['L'])
|
||||
for feature, value in props.get('morph', {}).items():
|
||||
self.morphology.assign_feature(&token.morph, feature, value)
|
||||
return tokens
|
||||
|
|
Loading…
Reference in New Issue
Block a user