From ca32a1ab017d6a83a7c3b26ae7cff3640690c888 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 30 Sep 2016 20:20:22 +0200
Subject: [PATCH] Revert "Work on Issue #285: intern strings into
 document-specific pools, to address streaming data memory growth.
 StringStore.__getitem__ now raises KeyError when it can't find the string.
 Use StringStore.intern() to get the old behaviour. Still need to hunt down
 all uses of StringStore.__getitem__ in library and do testing, but logic
 looks good."

This reverts commit 8423e8627f0c9e54ab026d5c51701e9e0a9c38d2.
---
 spacy/strings.pxd      |  10 +---
 spacy/strings.pyx      | 112 +++++++++--------------------------------
 spacy/tokens/token.pyx |  32 ++++++------
 spacy/typedefs.pxd     |   4 --
 spacy/vocab.pxd        |   1 -
 spacy/vocab.pyx        |  16 +++---
 6 files changed, 48 insertions(+), 127 deletions(-)
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index df61712c8..e2cd579c0 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -7,8 +7,6 @@ from libc.stdint cimport int64_t
 
 from .typedefs cimport hash_t
 
-DEF UINT64_MAX = 18446744073709551615
-
 cpdef hash_t hash_string(unicode string) except 0
 
 
@@ -24,10 +22,6 @@ cdef class StringStore:
 
     cdef public PreshMap _map
     cdef int64_t _resize_at
-    cdef PreshMap oov_maps
 
-    cpdef int remove_oov_map(self, Pool mem) except -1
-
-    cdef hash_t intern(self, unicode py_string, Pool mem=*) except UINT64_MAX
-    cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string,
-                                     int length) except NULL
+    cdef const Utf8Str* intern(self, unicode py_string) except NULL
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 509b475e9..2e81bd87b 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,4 +1,3 @@
-# cython: infer_types=True
 from __future__ import unicode_literals, absolute_import
 
 cimport cython
@@ -7,8 +6,7 @@ from libc.stdint cimport uint64_t
 
 from murmurhash.mrmr cimport hash64
 
-from preshed.maps cimport map_init, map_set, map_get, map_iter
-from preshed.maps cimport MapStruct
+from preshed.maps cimport map_iter, key_t
 
 from .typedefs cimport hash_t
 
@@ -18,17 +16,13 @@ except ImportError:
     import json
 
 
-DEF UINT64_MAX = 18446744073709551615
-
-
 cpdef hash_t hash_string(unicode string) except 0:
-    byte_string = string.encode('utf8')
-    cdef unsigned char* chars = byte_string
-    return _hash_utf8(chars, len(byte_string))
+    chars = string.encode('utf8')
+    return _hash_utf8(chars, len(chars))
 
 
-cdef hash_t _hash_utf8(const unsigned char* utf8_string, int length) nogil:
-    return hash64(<void*>utf8_string, length, 1)
+cdef hash_t _hash_utf8(char* utf8_string, int length):
+    return hash64(utf8_string, length, 1)
 
 
 cdef unicode _decode(const Utf8Str* string):
@@ -80,7 +74,6 @@ cdef class StringStore:
     def __init__(self, strings=None):
         self.mem = Pool()
         self._map = PreshMap()
-        self.oov_maps = PreshMap()
         self._resize_at = 10000
         self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
         self.size = 1
@@ -115,21 +108,14 @@ cdef class StringStore:
             byte_string = <bytes>string_or_id
             if len(byte_string) == 0:
                 return 0
-            key = _hash_utf8(byte_string, len(byte_string))
-            utf8str = <Utf8Str*>self._map.get(key)
-            if utf8str is NULL:
-                raise KeyError(byte_string)
-            else:
-                return utf8str - self.c
+            utf8str = self._intern_utf8(byte_string, len(byte_string))
+            return utf8str - self.c
         elif isinstance(string_or_id, unicode):
             if len(<unicode>string_or_id) == 0:
                 return 0
-            key = hash_string(string_or_id)
-            utf8str = <Utf8Str*>self._map.get(key)
-            if utf8str is NULL:
-                raise KeyError(string_or_id)
-            else:
-                return utf8str - self.c
+            byte_string = (<unicode>string_or_id).encode('utf8')
+            utf8str = self._intern_utf8(byte_string, len(byte_string))
+            return utf8str - self.c
         else:
             raise TypeError(type(string_or_id))
 
@@ -145,8 +131,6 @@ cdef class StringStore:
             yield _decode(&self.c[i]) if i > 0 else u''
 
     def __reduce__(self):
-        # TODO: Is it problematic that we don't save the OOV strings?
-        # Probably yes? We're not restoring all the state...
         strings = [""]
         for i in range(1, self.size):
             string = &self.c[i]
@@ -154,77 +138,27 @@ cdef class StringStore:
             strings.append(py_string)
         return (StringStore, (strings,), None, None, None)
 
-    cdef hash_t intern(self, unicode py_string, Pool mem=None) except UINT64_MAX:
-        if mem is None:
-            mem = self.mem
-        cdef hash_t map_key = id(mem)
+    cdef const Utf8Str* intern(self, unicode py_string) except NULL:
+        # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
-        cdef hash_t key = _hash_utf8(byte_string, len(byte_string))
-        cdef const Utf8Str* utf8str = <Utf8Str*>self._map.get(key)
-        cdef hash_t map_id = id(mem)
-        cdef MapStruct* oov_map
-        if utf8str is not NULL:
-            return utf8str - self.c
-        elif mem is None or mem is self.mem:
-            utf8str = self._intern_utf8(byte_string, len(byte_string))
-            return utf8str - self.c
-        else:
-            new_utf8str = <Utf8Str*>mem.alloc(sizeof(Utf8Str), 1)
-            oov_map = <MapStruct*>self.oov_maps.get(map_key)
-            if oov_map is NULL:
-                oov_map = <MapStruct*>mem.alloc(sizeof(MapStruct), 1)
-                map_init(mem, oov_map, 16)
-                self.oov_maps.set(id(mem), oov_map)
-            new_utf8str[0] = _allocate(mem, byte_string, len(byte_string))
-            map_set(mem, oov_map, key, new_utf8str)
-            return key
-
-    def decode_int(self, hash_t int_, Pool mem=None):
-        cdef hash_t map_key
-        if int_ == 0:
-            return u''
-        elif int_ < <uint64_t>self.size:
-            return _decode(&self.c[int_])
-        elif mem is None or mem is self.mem:
-            raise IndexError(int_)
-        else:
-            map_key = id(mem)
-            oov_map = <MapStruct*>self.oov_maps.get(map_key)
-            if oov_map is NULL:
-                raise IndexError(
-                    "Trying to decode integer into string, but it's not in " +
-                    "the main store, and the memory pool hasn't been seen before.\n" +
-                    ("int_ == %d\n" % int_) + 
-                    "id(mem) == %d" % map_key) 
-            else:
-                utf8str = <const Utf8Str*>map_get(oov_map, int_)
-                if utf8str is NULL:
-                    raise IndexError(
-                        "Trying to decode integer into string, but it's not in " +
-                        "the main store. The integer was also not found in the " +
-                        "indicated auxiliary pool " +
-                        "(which is usually specific to a document)." +
-                        ("int_ == %d\n" % int_) +
-                        "id(mem) == %d" % map_key)
-                return _decode(utf8str)
+        return self._intern_utf8(byte_string, len(byte_string))
 
     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string,
-            int length) except NULL:
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL:
+        # 0 means missing, but we don't bother offsetting the index.
+        cdef hash_t key = _hash_utf8(utf8_string, length)
+        value = <Utf8Str*>self._map.get(key)
+        if value is not NULL:
+            return value
+
         if self.size == self._resize_at:
             self._realloc()
-        key = _hash_utf8(utf8_string, length)
-        self.c[self.size] = _allocate(self.mem, utf8_string, length)
+        self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, <void*>&self.c[self.size])
         self.size += 1
         return &self.c[self.size-1]
 
-    cpdef int remove_oov_map(self, Pool mem) except -1:
-        cdef hash_t key = id(mem)
-        self._maps.pop(key)
-
     def dump(self, file_):
-        # TODO: Is it problematic that we don't save the OOV strings? No, right?
         string_data = json.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')
@@ -246,8 +180,8 @@ cdef class StringStore:
         # we resize our array. So, first we remap to indices, then we resize,
         # then we can acquire the new pointers.
         cdef Pool tmp_mem = Pool()
-        keys = <hash_t*>tmp_mem.alloc(self.size, sizeof(hash_t))
-        cdef hash_t key
+        keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
+        cdef key_t key
         cdef void* value
         cdef const Utf8Str ptr
         cdef int i = 0
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 9320cb85a..9b612a867 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -116,11 +116,11 @@ cdef class Token:
 
     property text_with_ws:
         def __get__(self):
-            orth_ = self.orth_
+            cdef unicode orth = self.vocab.strings[self.c.lex.orth]
             if self.c.spacy:
-                return orth_ + u' '
+                return orth + u' '
             else:
-                return orth_
+                return orth
 
     property prob:
         def __get__(self):
@@ -403,7 +403,7 @@ cdef class Token:
 
     property ent_type_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.ent_type, mem=self.mem)
+            return self.vocab.strings[self.c.ent_type]
 
     property ent_iob_:
         def __get__(self):
@@ -424,7 +424,7 @@ cdef class Token:
     property ent_id_:
         '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.ent_id, mem=self.mem)
+            return self.vocab.strings[self.c.ent_id]
 
         def __set__(self, hash_t key):
             # TODO
@@ -438,35 +438,35 @@ cdef class Token:
 
     property orth_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem)
+            return self.vocab.strings[self.c.lex.orth]
 
     property lower_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem)
+            return self.vocab.strings[self.c.lex.lower]
 
     property norm_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem)
+            return self.vocab.strings[self.c.lex.norm]
 
     property shape_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem)
+            return self.vocab.strings[self.c.lex.shape]
 
     property prefix_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lex.prefix, mem=self.mem)
+            return self.vocab.strings[self.c.lex.prefix]
 
     property suffix_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lex.suffix, mem=self.mem)
+            return self.vocab.strings[self.c.lex.suffix]
 
     property lang_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem)
+            return self.vocab.strings[self.c.lex.lang]
 
     property lemma_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem)
+            return self.vocab.strings[self.c.lemma]
 
     property pos_:
         def __get__(self):
@@ -474,13 +474,13 @@ cdef class Token:
 
     property tag_:
         def __get__(self):
-            return self.vocab.strings.decode_int(self.c.tag, mem=self.mem)
+            return self.vocab.strings[self.c.tag]
 
     property dep_:
         def __get__(self):
-            return self.vocab.decode_int(self.c.dep, mem=self.mem)
+            return self.vocab.strings[self.c.dep]
         def __set__(self, unicode label):
-            self.c.dep = self.vocab.strings.intern(label, mem=self.mem)
+            self.c.dep = self.vocab.strings[label]
 
     property is_oov:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd
index 65ab7fe62..bd863d247 100644
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -1,10 +1,6 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
 from libc.stdint cimport uint8_t
 
-from libc.stdint cimport UINT64_MAX as err_hash_t
-from libc.stdint cimport UINT64_MAX as err_flags_t
-from libc.stdint cimport UINT64_MAX as err_len_t
-from libc.stdint cimport UINT64_MAX as err_tag_t
 
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index f136ef900..8e0e363bc 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -27,7 +27,6 @@ cdef struct _Cached:
 cdef class Vocab:
     cdef Pool mem
     cpdef readonly StringStore strings
-    cpdef readonly dict oov_stores
     cpdef readonly Morphology morphology
     cdef readonly int length
     cdef public object _serializer
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index d7dca9cf9..764e70c49 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -93,7 +93,6 @@ cdef class Vocab:
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
-        self.oov_stores = {}
         # Load strings in a special order, so that we have an onset number for
         # the vocabulary. This way, when words are added in order, the orth ID
         # is the frequency rank of the word, plus a certain offset. The structural
@@ -141,7 +140,7 @@ cdef class Vocab:
         lex = <LexemeC*>self._by_hash.get(key)
         cdef size_t addr
         if lex != NULL:
-            if (string not in self.strings) or (lex.orth != self.strings[string]):
+            if lex.orth != self.strings[string]:
                 raise LookupError.mismatched_strings(
                     lex.orth, self.strings[string], self.strings[lex.orth], string)
             return lex
@@ -164,10 +163,10 @@ cdef class Vocab:
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
         cdef hash_t key
         cdef bint is_oov = mem is not self.mem
-        if len(string) < 3 or not is_oov:
+        if len(string) < 3:
             mem = self.mem
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        lex.orth = self.strings.intern(string, mem=mem)
+        lex.orth = self.strings[string]
         lex.length = len(string)
         lex.id = self.length
         lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
@@ -175,7 +174,7 @@ cdef class Vocab:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
                 if isinstance(value, unicode):
-                    value = self.strings.intern(value)
+                    value = self.strings[value]
                 if attr == PROB:
                     lex.prob = value
                 else:
@@ -206,8 +205,7 @@ cdef class Vocab:
 
     def __getitem__(self,  id_or_string):
         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
-        unseen unicode string is given, a new lexeme is created and stored, and
-        the string is interned in the vocabulary.
+        unseen unicode string is given, a new lexeme is created and stored.
 
         Args:
             id_or_string (int or unicode):
@@ -222,7 +220,7 @@ cdef class Vocab:
         '''
         cdef attr_t orth
         if type(id_or_string) == unicode:
-            orth = self.strings.intern(id_or_string)
+            orth = self.strings[id_or_string]
         else:
             orth = id_or_string
         return Lexeme(self, orth)
@@ -238,7 +236,7 @@ cdef class Vocab:
             if 'pos' in props:
                 self.morphology.assign_tag(token, props['pos'])
             if 'L' in props:
-                tokens[i].lemma = self.strings.intern(props['L'])
+                tokens[i].lemma = self.strings[props['L']]
             for feature, value in props.get('morph', {}).items():
                 self.morphology.assign_feature(&token.morph, feature, value)
         return tokens