Work on changing StringStore to return hashes.

2025-09-05 11:54:54 +03:00 · 2017-05-28 12:36:27 +02:00 · 2017-05-28 12:36:27 +02:00 · a5606c3eda
commit a5606c3eda
parent 39293ab2ee
5 changed files with 82 additions and 127 deletions
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -1,4 +1,5 @@
 from libc.stdint cimport int64_t
 from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
 cpdef hash_t hash_string(unicode string) except 0
 cdef hash_t hash_utf8(char* utf8_string, int length) nogil
 cdef unicode decode_Utf8Str(const Utf8Str* string)
 ctypedef union Utf8Str:
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
 cdef class StringStore:
    cdef Pool mem
    cdef Utf8Str* c
    cdef int64_t size
    cdef bint is_frozen
    cdef vector[hash_t] keys
    cdef public PreshMap _map
    cdef public PreshMap _oov
    cdef int64_t _resize_at
    cdef const Utf8Str* intern_unicode(self, unicode py_string)
    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -28,7 +28,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
    return hash32(utf8_string, length, 1)
-cdef unicode _decode(const Utf8Str* string):
+cdef unicode decode_Utf8Str(const Utf8Str* string):
    cdef int i, length
    if string.s[0] < sizeof(string.s) and string.s[0] != 0:
        return string.s[1:string.s[0]+1].decode('utf8')
@ -45,10 +45,10 @@ cdef unicode _decode(const Utf8Str* string):
        return string.p[i:length + i].decode('utf8')
-cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
+cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
    cdef int n_length_bytes
    cdef int i
-    cdef Utf8Str string
+    cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
    cdef uint32_t ulength = length
    if length < sizeof(string.s):
        string.s[0] = <unsigned char>length
@ -73,7 +73,7 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
 cdef class StringStore:
-    """Map strings to and from integer IDs."""
+    """Lookup strings by 64-bit hash"""
    def __init__(self, strings=None, freeze=False):
        """Create the StringStore.
@ -83,68 +83,56 @@ cdef class StringStore:
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
        self._resize_at = 10000
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
        self.is_frozen = freeze
        if strings is not None:
            for string in strings:
-                _ = self[string]
+                self.add(string)
    property size:
        def __get__(self):
            return self.size -1
    def __len__(self):
        """The number of strings in the store.
        RETURNS (int): The number of strings in the store.
        """
        return self.size-1
    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given integer ID, or vice versa.
+        """Retrieve a string from a given hash ID, or vice versa.
-        string_or_id (bytes or unicode or int): The value to encode.
+        string_or_id (bytes or unicode or uint64): The value to encode.
-        Returns (unicode or int): The value to be retrieved.
+        Returns (unicode or uint64): The value to be retrieved.
        """
        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
            return 0
        elif string_or_id == 0:
            return u''
-        cdef bytes byte_string
+        cdef hash_t key
-        cdef const Utf8Str* utf8str
+
-        cdef uint64_t int_id
+        if isinstance(string_or_id, unicode):
-        cdef uint32_t oov_id
+            key = hash_string(string_or_id)
-        if isinstance(string_or_id, (int, long)):
+            return key
-            int_id = string_or_id
+        elif isinstance(string_or_id, bytes):
-            oov_id = string_or_id
+            key = hash_utf8(string_or_id, len(string_or_id))
-            if int_id < <uint64_t>self.size:
+            return key
                return _decode(&self.c[int_id])
        else:
-                utf8str = <Utf8Str*>self._oov.get(oov_id)
+            key = string_or_id
-                if utf8str is not NULL:
+            utf8str = <Utf8Str*>self._map.get(key)
                    return _decode(utf8str)
                else:
                    raise IndexError(string_or_id)
        else:
            if isinstance(string_or_id, bytes):
                byte_string = <bytes>string_or_id
            elif isinstance(string_or_id, unicode):
                byte_string = (<unicode>string_or_id).encode('utf8')
            else:
                raise TypeError(type(string_or_id))
            utf8str = self._intern_utf8(byte_string, len(byte_string))
            if utf8str is NULL:
-                # TODO: We need to use 32 bit here, for compatibility with the
+                raise KeyError(string_or_id)
                # vocabulary values. This makes birthday paradox probabilities
                # pretty bad.
                # We could also get unlucky here, and hash into a value that
                # collides with the 'real' strings.
                return hash32_utf8(byte_string, len(byte_string))
            else:
-                return utf8str - self.c
+                return decode_Utf8Str(utf8str)
    def add(self, string):
        if isinstance(string, unicode):
            key = hash_string(string)
            self.intern_unicode(string)
        elif isinstance(string, bytes):
            key = hash_utf8(string, len(string))
            self._intern_utf8(string, len(string))
        else:
            raise TypeError(
                "Can only add unicode or bytes. Got type: %s" % type(string))
        return key
    def __len__(self):
        """The number of strings in the store.
        RETURNS (int): The number of strings in the store.
        """
        return self.keys.size()
    def __contains__(self, unicode string not None):
        """Check whether a string is in the store.
@ -163,16 +151,15 @@ cdef class StringStore:
        YIELDS (unicode): A string in the store.
        """
        cdef int i
-        for i in range(self.size):
+        cdef hash_t key
-            yield _decode(&self.c[i]) if i > 0 else u''
+        for i in range(self.keys.size()):
            key = self.keys[i]
            utf8str = <Utf8Str*>self._map.get(key)
            yield decode_Utf8Str(utf8str)
        # TODO: Iterate OOV here?
    def __reduce__(self):
-        strings = [""]
+        strings = list(self)
        for i in range(1, self.size):
            string = &self.c[i]
            py_string = _decode(string)
            strings.append(py_string)
        return (StringStore, (strings,), None, None, None)
    def to_disk(self, path):
@ -230,11 +217,9 @@ cdef class StringStore:
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
-        self._resize_at = 10000
+        self.keys.clear()
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
        for string in strings:
-            _ = self[string]
+            self.add(string)
        self.is_frozen = freeze
    cdef const Utf8Str* intern_unicode(self, unicode py_string):
@ -258,39 +243,11 @@ cdef class StringStore:
            key32 = hash32_utf8(utf8_string, length)
            # Important: Make the OOV store own the memory. That way it's trivial
            # to flush them all.
-            value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
+            value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
            value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
            self._oov.set(key32, value)
            return NULL
-        if self.size == self._resize_at:
+        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
-            self._realloc()
+        self._map.set(key, value)
-        self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
+        self.keys.push_back(key)
-        self._map.set(key, <void*>&self.c[self.size])
+        return value
        self.size += 1
        return &self.c[self.size-1]
    def _realloc(self):
        # We want to map straight to pointers, but they'll be invalidated if
        # we resize our array. So, first we remap to indices, then we resize,
        # then we can acquire the new pointers.
        cdef Pool tmp_mem = Pool()
        keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
        cdef key_t key
        cdef void* value
        cdef const Utf8Str ptr
        cdef int i = 0
        cdef size_t offset
        while map_iter(self._map.c_map, &i, &key, &value):
            # Find array index with pointer arithmetic
            offset = ((<Utf8Str*>value) - self.c)
            keys[offset] = key
        self._resize_at *= 2
        cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
        self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
        self._map = PreshMap(self.size)
        for i in range(self.size):
            if keys[i]:
                self._map.set(keys[i], &self.c[i])
--- a/spacy/tests/stringstore/test_stringstore.py
+++ b/spacy/tests/stringstore/test_stringstore.py
@ -8,69 +8,65 @@ import pytest
@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
 def test_stringstore_save_bytes(stringstore, text1, text2, text3):
-    i = stringstore[text1]
+    key = stringstore.add(text1)
-    assert i == 1
+    assert stringstore[text1] == key
-    assert stringstore[text1] == 1
+    assert stringstore[text2] != key
-    assert stringstore[text2] != i
+    assert stringstore[text3] != key
    assert stringstore[text3] != i
    assert i == 1
@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')])
 def test_stringstore_save_unicode(stringstore, text1, text2, text3):
-    i = stringstore[text1]
+    key = stringstore.add(text1)
-    assert i == 1
+    assert stringstore[text1] == key
-    assert stringstore[text1] == 1
+    assert stringstore[text2] != key
-    assert stringstore[text2] != i
+    assert stringstore[text3] != key
    assert stringstore[text3] != i
    assert i == 1
@pytest.mark.parametrize('text', [b'A'])
 def test_stringstore_retrieve_id(stringstore, text):
-    i = stringstore[text]
+    key = stringstore.add(text)
-    assert stringstore.size == 1
+    assert len(stringstore) == 1
-    assert stringstore[1] == text.decode('utf8')
+    assert stringstore[key] == text.decode('utf8')
-    with pytest.raises(IndexError):
+    with pytest.raises(KeyError):
        stringstore[2]
@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')])
 def test_stringstore_med_string(stringstore, text1, text2):
-    store = stringstore[text1]
+    store = stringstore.add(text1)
    assert stringstore[store] == text1.decode('utf8')
-    dummy = stringstore[text2]
+    dummy = stringstore.add(text2)
    assert stringstore[text1] == store
 def test_stringstore_long_string(stringstore):
    text = "INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&amp;hl=en&amp;num=50&amp;btnG=Google+Search&amp;as_epq=&amp;as_oq=&amp;as_eq=&amp;lr=&amp;as_ft=i&amp;as_filetype=&amp;as_qdr=all&amp;as_nlo=&amp;as_nhi=&amp;as_occt=any&amp;as_dt=i&amp;as_sitesearch=&amp;as_rights=&amp;safe=off"
-    store = stringstore[text]
+    store = stringstore.add(text)
    assert stringstore[store] == text
@pytest.mark.parametrize('factor', [254, 255, 256])
 def test_stringstore_multiply(stringstore, factor):
    text = 'a' * factor
-    store = stringstore[text]
+    store = stringstore.add(text)
    assert stringstore[store] == text
 def test_stringstore_massive_strings(stringstore):
    text = 'a' * 511
-    store = stringstore[text]
+    store = stringstore.add(text)
    assert stringstore[store] == text
    text2 = 'z' * 512
-    store = stringstore[text2]
+    store = stringstore.add(text2)
    assert stringstore[store] == text2
    text3 = '1' * 513
-    store = stringstore[text3]
+    store = stringstore.add(text3)
    assert stringstore[store] == text3
@pytest.mark.parametrize('text', ["qqqqq"])
 def test_stringstore_to_bytes(stringstore, text):
-    store = stringstore[text]
+    store = stringstore.add(text)
    serialized = stringstore.to_bytes()
    new_stringstore = StringStore().from_bytes(serialized)
    assert new_stringstore[store] == text
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -4,7 +4,7 @@ from libc.stdint cimport uint8_t
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
-ctypedef int32_t attr_t
+ctypedef uint64_t attr_t
 ctypedef uint64_t flags_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -172,7 +172,7 @@ cdef class Vocab:
            for attr, func in self.lex_attr_getters.items():
                value = func(string)
                if isinstance(value, unicode):
-                    value = self.strings[value]
+                    value = self.strings.add(value)
                if attr == PROB:
                    lex.prob = value
                elif value is not None:
@ -227,7 +227,7 @@ cdef class Vocab:
        """
        cdef attr_t orth
        if type(id_or_string) == unicode:
-            orth = self.strings[id_or_string]
+            orth = self.strings.add(id_or_string)
        else:
            orth = id_or_string
        return Lexeme(self, orth)
@ -291,7 +291,7 @@ cdef class Vocab:
        with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
            strings_list = ujson.load(file_)
        for string in strings_list:
-            self.strings[string]
+            self.strings.add(string)
        self.load_lexemes(path / 'lexemes.bin')
    def to_bytes(self, **exclude):