From ca32a1ab017d6a83a7c3b26ae7cff3640690c888 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 30 Sep 2016 20:20:22 +0200 Subject: [PATCH] Revert "Work on Issue #285: intern strings into document-specific pools, to address streaming data memory growth. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in library and do testing, but logic looks good." This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2. --- spacy/strings.pxd | 10 +--- spacy/strings.pyx | 112 +++++++++-------------------------------- spacy/tokens/token.pyx | 32 ++++++------ spacy/typedefs.pxd | 4 -- spacy/vocab.pxd | 1 - spacy/vocab.pyx | 16 +++--- 6 files changed, 48 insertions(+), 127 deletions(-) diff --git a/spacy/strings.pxd b/spacy/strings.pxd index df61712c8..e2cd579c0 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -7,8 +7,6 @@ from libc.stdint cimport int64_t from .typedefs cimport hash_t -DEF UINT64_MAX = 18446744073709551615 - cpdef hash_t hash_string(unicode string) except 0 @@ -24,10 +22,6 @@ cdef class StringStore: cdef public PreshMap _map cdef int64_t _resize_at - cdef PreshMap oov_maps - cpdef int remove_oov_map(self, Pool mem) except -1 - - cdef hash_t intern(self, unicode py_string, Pool mem=*) except UINT64_MAX - cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string, - int length) except NULL + cdef const Utf8Str* intern(self, unicode py_string) except NULL + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 509b475e9..2e81bd87b 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,4 +1,3 @@ -# cython: infer_types=True from __future__ import unicode_literals, absolute_import cimport cython @@ -7,8 +6,7 @@ from libc.stdint cimport uint64_t from murmurhash.mrmr cimport hash64 -from preshed.maps cimport map_init, map_set, map_get, map_iter -from preshed.maps cimport MapStruct +from preshed.maps cimport map_iter, key_t from .typedefs cimport hash_t @@ -18,17 +16,13 @@ except ImportError: import json -DEF UINT64_MAX = 18446744073709551615 - - cpdef hash_t hash_string(unicode string) except 0: - byte_string = string.encode('utf8') - cdef unsigned char* chars = byte_string - return _hash_utf8(chars, len(byte_string)) + chars = string.encode('utf8') + return _hash_utf8(chars, len(chars)) -cdef hash_t _hash_utf8(const unsigned char* utf8_string, int length) nogil: - return hash64(utf8_string, length, 1) +cdef hash_t _hash_utf8(char* utf8_string, int length): + return hash64(utf8_string, length, 1) cdef unicode _decode(const Utf8Str* string): @@ -80,7 +74,6 @@ cdef class StringStore: def __init__(self, strings=None): self.mem = Pool() self._map = PreshMap() - self.oov_maps = PreshMap() self._resize_at = 10000 self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 @@ -115,21 +108,14 @@ cdef class StringStore: byte_string = string_or_id if len(byte_string) == 0: return 0 - key = _hash_utf8(byte_string, len(byte_string)) - utf8str = self._map.get(key) - if utf8str is NULL: - raise KeyError(byte_string) - else: - return utf8str - self.c + utf8str = self._intern_utf8(byte_string, len(byte_string)) + return utf8str - self.c elif isinstance(string_or_id, unicode): if len(string_or_id) == 0: return 0 - key = hash_string(string_or_id) - utf8str = self._map.get(key) - if utf8str is NULL: - raise KeyError(string_or_id) - else: - return utf8str - self.c + byte_string = (string_or_id).encode('utf8') + utf8str = self._intern_utf8(byte_string, len(byte_string)) + return utf8str - self.c else: raise TypeError(type(string_or_id)) @@ -145,8 +131,6 @@ cdef class StringStore: yield _decode(&self.c[i]) if i > 0 else u'' def __reduce__(self): - # TODO: Is it problematic that we don't save the OOV strings? - # Probably yes? We're not restoring all the state... strings = [""] for i in range(1, self.size): string = &self.c[i] @@ -154,77 +138,27 @@ cdef class StringStore: strings.append(py_string) return (StringStore, (strings,), None, None, None) - cdef hash_t intern(self, unicode py_string, Pool mem=None) except UINT64_MAX: - if mem is None: - mem = self.mem - cdef hash_t map_key = id(mem) + cdef const Utf8Str* intern(self, unicode py_string) except NULL: + # 0 means missing, but we don't bother offsetting the index. cdef bytes byte_string = py_string.encode('utf8') - cdef hash_t key = _hash_utf8(byte_string, len(byte_string)) - cdef const Utf8Str* utf8str = self._map.get(key) - cdef hash_t map_id = id(mem) - cdef MapStruct* oov_map - if utf8str is not NULL: - return utf8str - self.c - elif mem is None or mem is self.mem: - utf8str = self._intern_utf8(byte_string, len(byte_string)) - return utf8str - self.c - else: - new_utf8str = mem.alloc(sizeof(Utf8Str), 1) - oov_map = self.oov_maps.get(map_key) - if oov_map is NULL: - oov_map = mem.alloc(sizeof(MapStruct), 1) - map_init(mem, oov_map, 16) - self.oov_maps.set(id(mem), oov_map) - new_utf8str[0] = _allocate(mem, byte_string, len(byte_string)) - map_set(mem, oov_map, key, new_utf8str) - return key - - def decode_int(self, hash_t int_, Pool mem=None): - cdef hash_t map_key - if int_ == 0: - return u'' - elif int_ < self.size: - return _decode(&self.c[int_]) - elif mem is None or mem is self.mem: - raise IndexError(int_) - else: - map_key = id(mem) - oov_map = self.oov_maps.get(map_key) - if oov_map is NULL: - raise IndexError( - "Trying to decode integer into string, but it's not in " + - "the main store, and the memory pool hasn't been seen before.\n" + - ("int_ == %d\n" % int_) + - "id(mem) == %d" % map_key) - else: - utf8str = map_get(oov_map, int_) - if utf8str is NULL: - raise IndexError( - "Trying to decode integer into string, but it's not in " + - "the main store. The integer was also not found in the " + - "indicated auxiliary pool " + - "(which is usually specific to a document)." + - ("int_ == %d\n" % int_) + - "id(mem) == %d" % map_key) - return _decode(utf8str) + return self._intern_utf8(byte_string, len(byte_string)) @cython.final - cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string, - int length) except NULL: + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL: + # 0 means missing, but we don't bother offsetting the index. + cdef hash_t key = _hash_utf8(utf8_string, length) + value = self._map.get(key) + if value is not NULL: + return value + if self.size == self._resize_at: self._realloc() - key = _hash_utf8(utf8_string, length) - self.c[self.size] = _allocate(self.mem, utf8_string, length) + self.c[self.size] = _allocate(self.mem, utf8_string, length) self._map.set(key, &self.c[self.size]) self.size += 1 return &self.c[self.size-1] - cpdef int remove_oov_map(self, Pool mem) except -1: - cdef hash_t key = id(mem) - self._maps.pop(key) - def dump(self, file_): - # TODO: Is it problematic that we don't save the OOV strings? No, right? string_data = json.dumps(list(self)) if not isinstance(string_data, unicode): string_data = string_data.decode('utf8') @@ -246,8 +180,8 @@ cdef class StringStore: # we resize our array. So, first we remap to indices, then we resize, # then we can acquire the new pointers. cdef Pool tmp_mem = Pool() - keys = tmp_mem.alloc(self.size, sizeof(hash_t)) - cdef hash_t key + keys = tmp_mem.alloc(self.size, sizeof(key_t)) + cdef key_t key cdef void* value cdef const Utf8Str ptr cdef int i = 0 diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 9320cb85a..9b612a867 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -116,11 +116,11 @@ cdef class Token: property text_with_ws: def __get__(self): - orth_ = self.orth_ + cdef unicode orth = self.vocab.strings[self.c.lex.orth] if self.c.spacy: - return orth_ + u' ' + return orth + u' ' else: - return orth_ + return orth property prob: def __get__(self): @@ -403,7 +403,7 @@ cdef class Token: property ent_type_: def __get__(self): - return self.vocab.strings.decode_int(self.c.ent_type, mem=self.mem) + return self.vocab.strings[self.c.ent_type] property ent_iob_: def __get__(self): @@ -424,7 +424,7 @@ cdef class Token: property ent_id_: '''A (string) entity ID. Usually assigned by patterns in the Matcher.''' def __get__(self): - return self.vocab.strings.decode_int(self.c.ent_id, mem=self.mem) + return self.vocab.strings[self.c.ent_id] def __set__(self, hash_t key): # TODO @@ -438,35 +438,35 @@ cdef class Token: property orth_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem) + return self.vocab.strings[self.c.lex.orth] property lower_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem) + return self.vocab.strings[self.c.lex.lower] property norm_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem) + return self.vocab.strings[self.c.lex.norm] property shape_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem) + return self.vocab.strings[self.c.lex.shape] property prefix_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.prefix, mem=self.mem) + return self.vocab.strings[self.c.lex.prefix] property suffix_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.suffix, mem=self.mem) + return self.vocab.strings[self.c.lex.suffix] property lang_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem) + return self.vocab.strings[self.c.lex.lang] property lemma_: def __get__(self): - return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem) + return self.vocab.strings[self.c.lemma] property pos_: def __get__(self): @@ -474,13 +474,13 @@ cdef class Token: property tag_: def __get__(self): - return self.vocab.strings.decode_int(self.c.tag, mem=self.mem) + return self.vocab.strings[self.c.tag] property dep_: def __get__(self): - return self.vocab.decode_int(self.c.dep, mem=self.mem) + return self.vocab.strings[self.c.dep] def __set__(self, unicode label): - self.c.dep = self.vocab.strings.intern(label, mem=self.mem) + self.c.dep = self.vocab.strings[label] property is_oov: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 65ab7fe62..bd863d247 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,10 +1,6 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t from libc.stdint cimport uint8_t -from libc.stdint cimport UINT64_MAX as err_hash_t -from libc.stdint cimport UINT64_MAX as err_flags_t -from libc.stdint cimport UINT64_MAX as err_len_t -from libc.stdint cimport UINT64_MAX as err_tag_t ctypedef uint64_t hash_t ctypedef char* utf8_t diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index f136ef900..8e0e363bc 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -27,7 +27,6 @@ cdef struct _Cached: cdef class Vocab: cdef Pool mem cpdef readonly StringStore strings - cpdef readonly dict oov_stores cpdef readonly Morphology morphology cdef readonly int length cdef public object _serializer diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d7dca9cf9..764e70c49 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -93,7 +93,6 @@ cdef class Vocab: self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.oov_stores = {} # Load strings in a special order, so that we have an onset number for # the vocabulary. This way, when words are added in order, the orth ID # is the frequency rank of the word, plus a certain offset. The structural @@ -141,7 +140,7 @@ cdef class Vocab: lex = self._by_hash.get(key) cdef size_t addr if lex != NULL: - if (string not in self.strings) or (lex.orth != self.strings[string]): + if lex.orth != self.strings[string]: raise LookupError.mismatched_strings( lex.orth, self.strings[string], self.strings[lex.orth], string) return lex @@ -164,10 +163,10 @@ cdef class Vocab: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef hash_t key cdef bint is_oov = mem is not self.mem - if len(string) < 3 or not is_oov: + if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - lex.orth = self.strings.intern(string, mem=mem) + lex.orth = self.strings[string] lex.length = len(string) lex.id = self.length lex.vector = mem.alloc(self.vectors_length, sizeof(float)) @@ -175,7 +174,7 @@ cdef class Vocab: for attr, func in self.lex_attr_getters.items(): value = func(string) if isinstance(value, unicode): - value = self.strings.intern(value) + value = self.strings[value] if attr == PROB: lex.prob = value else: @@ -206,8 +205,7 @@ cdef class Vocab: def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously - unseen unicode string is given, a new lexeme is created and stored, and - the string is interned in the vocabulary. + unseen unicode string is given, a new lexeme is created and stored. Args: id_or_string (int or unicode): @@ -222,7 +220,7 @@ cdef class Vocab: ''' cdef attr_t orth if type(id_or_string) == unicode: - orth = self.strings.intern(id_or_string) + orth = self.strings[id_or_string] else: orth = id_or_string return Lexeme(self, orth) @@ -238,7 +236,7 @@ cdef class Vocab: if 'pos' in props: self.morphology.assign_tag(token, props['pos']) if 'L' in props: - tokens[i].lemma = self.strings.intern(props['L']) + tokens[i].lemma = self.strings[props['L']] for feature, value in props.get('morph', {}).items(): self.morphology.assign_feature(&token.morph, feature, value) return tokens