* Progress to getting WordTree working. Tests pass, but so far it's slower.

2025-10-18 09:44:16 +03:00 · 2014-08-16 19:59:38 +02:00 · 2014-08-16 19:59:38 +02:00 · 34b68a18ab
commit 34b68a18ab
parent 865cacfaf7
4 changed files with 84 additions and 21 deletions
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@ -1,5 +1,7 @@
 from libc.stdint cimport uint64_t

+from chartree cimport CharTree
+

 cdef class FixedTable:
    cdef size_t size
@ -9,3 +11,15 @@ cdef class FixedTable:
    cdef size_t insert(self, uint64_t key, size_t value) nogil
    cdef size_t get(self, uint64_t key) nogil
    cdef int erase(self, uint64_t key) nogil
+
+
+cdef class WordTree:
+    cdef size_t max_length
+    cdef size_t default
+    cdef CharTree* _trees
+    cdef dict _dict
+
+    cdef size_t get(self, unicode string) except *
+    cdef int set(self, unicode string, size_t value) except *
+    cdef bint contains(self, unicode string) except *
+    
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@ -1,6 +1,8 @@
 from libc.stdlib cimport calloc, free
 import cython

+cimport chartree
+

 cdef class FixedTable:
    def __cinit__(self, const size_t size):
@ -51,3 +53,46 @@ cdef class FixedTable:
@cython.cdivision
 cdef inline size_t _find(uint64_t key, size_t size) nogil:
    return key % size
+
+
+cdef class WordTree:
+    def __cinit__(self, size_t default, size_t max_length):
+        self.max_length = max_length
+        self.default = default
+        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
+        for i in range(self.max_length):
+            chartree.init(&self._trees[i], i)
+        self._dict = {}
+
+    cdef size_t get(self, unicode ustring) except *:
+        cdef bytes bstring = ustring.encode('utf8')
+        cdef size_t length = len(bstring)
+        if length >= self.max_length:
+            return self._dict.get(bstring, 0)
+        else:
+            return chartree.getitem(&self._trees[length], bstring)
+
+    cdef int set(self, unicode ustring, size_t value) except *:
+        cdef bytes bstring = ustring.encode('utf8')
+        cdef size_t length = len(bstring)
+        if length >= self.max_length:
+            self._dict[bstring] = value
+        else:
+            chartree.setitem(&self._trees[length], bstring, value)
+
+    cdef bint contains(self, unicode ustring) except *:
+        cdef bytes bstring = ustring.encode('utf8')
+        cdef size_t length = len(bstring)
+        if length >= self.max_length:
+            return bstring in self._dict
+        else:
+            return chartree.contains(&self._trees[length], bstring)
+
+    def __getitem__(self, unicode key):
+        return self.get(key)
+
+    def __setitem__(self, unicode key, size_t value):
+        self.set(key, value)
+    
+    def __contains__(self, unicode key):
+        return self.contains(key)
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@ -4,6 +4,7 @@ from libc.stdint cimport uint64_t

 from sparsehash.dense_hash_map cimport dense_hash_map
 from _hashing cimport FixedTable
+from _hashing cimport WordTree

 # Circular import problems here
 ctypedef size_t Lexeme_addr
@ -22,11 +23,12 @@ ctypedef int ClusterID
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Distribution
 from spacy.lexeme cimport Orthography
+from spacy._hashing cimport WordTree


 cdef class Language:
    cdef object name
-    cdef Vocab* vocab
+    cdef WordTree vocab
    cdef Vocab* distri
    cdef Vocab* ortho
    cdef dict bacov
@ -38,7 +40,7 @@ cdef class Language:
    cdef Orthography* lookup_orth(self, StringHash key, unicode lex) except NULL
    cdef Distribution* lookup_dist(self, StringHash key) except NULL
    
-    cdef Lexeme* new_lexeme(self, StringHash key, unicode lex) except NULL
+    cdef Lexeme* new_lexeme(self, unicode key, unicode lex) except NULL
    cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL
    cdef Distribution* new_dist(self, StringHash key) except NULL
    
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -5,6 +5,7 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

+from murmurhash cimport mrmr
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD

@ -15,6 +16,13 @@ from os import path
 cimport cython


+#cdef inline StringHash hash_string(unicode string, size_t length):
+#    '''Hash unicode with MurmurHash64A'''
+#    return hash(string)
+#    #cdef bytes byte_string = string.encode('utf8')
+#    #return mrmr.hash32(<char*>byte_string, len(byte_string) * sizeof(char), 0)
+
+
 def get_normalized(unicode lex, size_t length):
    if lex.isalpha() and lex.islower():
        return lex
@ -56,10 +64,9 @@ cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
-        self.vocab = new Vocab()
+        self.vocab = WordTree(0, 5)
        self.ortho = new Vocab()
        self.distri = new Vocab()
-        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))
@ -93,9 +100,9 @@ cdef class Language:

        cdef StringHash hashed = hash(string)
        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
+        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
        if word_ptr == NULL:
-            word_ptr = self.new_lexeme(hashed, string)
+            word_ptr = self.new_lexeme(string, string)
        return <Lexeme_addr>word_ptr

    cdef Lexeme_addr lookup_chunk(self, unicode string) except 0:
@ -106,18 +113,16 @@ cdef class Language:
        cdef size_t length = len(string)
        if length == 0:
            return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash(string)
        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
+        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
        cdef int split
        if word_ptr == NULL:
            split = self.find_split(string, length)
            if split != 0 and split != -1 and split < length:
-                word_ptr = self.new_lexeme(hashed, string[:split])
+                word_ptr = self.new_lexeme(string, string[:split])
                word_ptr.tail = <Lexeme*>self.lookup_chunk(string[split:])
-                self.bacov[hashed] = string
            else:
-                word_ptr = self.new_lexeme(hashed, string)
+                word_ptr = self.new_lexeme(string, string)
        return <Lexeme_addr>word_ptr

    cdef Orthography* lookup_orth(self, StringHash hashed, unicode lex):
@ -132,14 +137,15 @@ cdef class Language:
            dist = self.new_dist(hashed)
        return dist

-    cdef Lexeme* new_lexeme(self, StringHash key, unicode string) except NULL:
+    cdef Lexeme* new_lexeme(self, unicode key, unicode string) except NULL:
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-        word.sic = key
+        word.sic = hash(key)
        word.lex = hash(string)
        self.bacov[word.lex] = string
+        self.bacov[word.sic] = key
        word.orth = self.lookup_orth(word.lex, string)
        word.dist = self.lookup_dist(word.lex)
-        self.vocab[0][key] = <size_t>word
+        self.vocab.set(key, <size_t>word)
        return word   

    cdef Orthography* new_orth(self, StringHash hashed, unicode lex) except NULL:
@ -185,13 +191,10 @@ cdef class Language:
        cdef Lexeme* word
        cdef StringHash hashed
        for chunk, lex, tokens in token_rules:
-            hashed = hash(chunk)
-            word = <Lexeme*>self.new_lexeme(hashed, lex)
+            word = <Lexeme*>self.new_lexeme(chunk, lex)
            for i, lex in enumerate(tokens):
                token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                length = len(token_string)
-                hashed = hash(token_string)
-                word.tail = <Lexeme*>self.new_lexeme(hashed, lex)
+                word.tail = <Lexeme*>self.new_lexeme(token_string, lex)
                word = word.tail

    def load_clusters(self):
@ -208,8 +211,7 @@ cdef class Language:
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                hashed = hash(token_string)
-                word = self.init_lexeme(hashed, token_string)
+                word = self.new_lexeme(token_string, token_string)


 cdef inline bint _is_whitespace(unsigned char c) nogil: