* Shifting to WordTree instead of dense_hash_map for storage.

This commit is contained in:
Matthew Honnibal 2014-08-15 23:06:46 +02:00
parent f11c8e22eb
commit 3ada25b92d
7 changed files with 137 additions and 101 deletions

View File

@ -1,6 +1,9 @@
from libc.stdint cimport uint64_t
from chartree cimport CharTree
cdef bytes to_utf8(unicode string)
cdef class FixedTable:
cdef size_t size
cdef uint64_t* keys
@ -9,3 +12,15 @@ cdef class FixedTable:
cdef size_t insert(self, uint64_t key, size_t value) nogil
cdef size_t get(self, uint64_t key) nogil
cdef int erase(self, uint64_t key) nogil
# Hybrid string -> size_t map: strings shorter than max_length are stored in
# one CharTree per length; longer strings fall back to a Python dict.
cdef class WordTree:
    # Strings with len >= max_length are stored in _dict instead of _trees.
    cdef size_t max_length
    # Value reported for missing keys (see get in the implementation).
    cdef size_t default
    # Array of max_length CharTrees, one per string length; heap-allocated.
    cdef CharTree* _trees
    # Fallback storage for strings too long for the tree array.
    cdef dict _dict
    cdef size_t get(self, bytes string) except *
    cdef int set(self, bytes string, size_t value) except *
    cdef bint contains(self, bytes string) except *

View File

@ -1,6 +1,8 @@
from libc.stdlib cimport calloc, free
import cython
cimport chartree
cdef class FixedTable:
def __cinit__(self, const size_t size):
@ -51,3 +53,54 @@ cdef class FixedTable:
@cython.cdivision
cdef inline size_t _find(uint64_t key, size_t size) nogil:
    # Bucket index for the fixed-size table. cdivision skips Python modulo
    # semantics for speed; assumes size > 0 (C modulo by zero is undefined)
    # — TODO confirm callers guarantee a non-zero table size.
    return key % size
cdef bytes to_utf8(unicode string):
    # Encode a unicode string to UTF-8 bytes (keys for WordTree storage).
    return string.encode('UTF-8')
cdef unicode to_unicode(unsigned char[:] c_string, size_t length):
    # Slice the typed memoryview by the known length, so constructing the
    # bytes object never has to call strlen on the underlying buffer.
    cdef bytes raw = <bytes>c_string[:length]
    return raw.decode('utf8')
cdef class WordTree:
    """Hybrid string -> size_t map.

    Strings shorter than max_length are stored in one CharTree per length;
    strings of max_length or more fall back to a Python dict. `default` is
    the value reported for keys missing from the dict fallback.
    """
    def __cinit__(self, size_t default, size_t max_length):
        self.max_length = max_length
        self.default = default
        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
        if self._trees == NULL:
            # Without this check, chartree.init below would write through NULL.
            raise MemoryError()
        for i in range(self.max_length):
            chartree.init(&self._trees[i], i)
        self._dict = {}

    def __dealloc__(self):
        # _trees is calloc'd in __cinit__ and was never freed — release it
        # here to avoid leaking the array.
        # TODO(review): if CharTree allocates internally, a chartree-level
        # destructor should also be called per tree — confirm against chartree.
        if self._trees != NULL:
            free(self._trees)
            self._trees = NULL

    cdef size_t get(self, bytes string) except *:
        cdef size_t length = len(string)
        if length >= self.max_length:
            # Use the configured default rather than a hard-coded 0, so the
            # stored `default` field is honoured (previously it was dead).
            # Behaviour is unchanged for the existing WordTree(0, ...) usage.
            return self._dict.get(string, self.default)
        else:
            return chartree.getitem(&self._trees[length], string)

    cdef int set(self, bytes string, size_t value) except *:
        cdef size_t length = len(string)
        if length >= self.max_length:
            self._dict[string] = value
        else:
            chartree.setitem(&self._trees[length], string, value)

    cdef bint contains(self, bytes string) except *:
        cdef size_t length = len(string)
        if length >= self.max_length:
            return string in self._dict
        else:
            return chartree.contains(&self._trees[length], string)

    def __getitem__(self, unicode key):
        return self.get(to_utf8(key))

    def __setitem__(self, unicode key, size_t value):
        self.set(to_utf8(key), value)

    def __contains__(self, unicode key):
        return self.contains(to_utf8(key))

View File

@ -43,9 +43,8 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
# Don't count commas as punct if the next char is a number
if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
return False
# Don't count periods as punct if the next char is not whitespace
if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
return False
if word[i] == ".":
return True
return not word[i].isalnum()
@ -62,3 +61,6 @@ cpdef Lexeme_addr lookup(unicode string) except 0:
cpdef unicode unhash(StringHash hash_value):
    # Module-level convenience: look up the original string for a hash via
    # EN — presumably the module's English language singleton; confirm.
    return EN.unhash(hash_value)
def words():
    # Expose the known word strings of the EN instance (see Language.words).
    return EN.words

View File

@ -20,6 +20,11 @@ from spacy.spacy cimport StringHash
#SHAPE = StringAttr.shape
#LAST3 = StringAttr.last3
cdef Lexeme* init(StringHash hashed, bytes lex_string) except NULL:
    """Allocate a zeroed Lexeme and record its hash.

    Raises MemoryError if allocation fails (propagated via `except NULL`).
    NOTE(review): lex_string is currently unused — kept for interface
    compatibility with callers; confirm whether it should populate the lexeme.
    """
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
    if word == NULL:
        # Previously a failed calloc would be dereferenced on the next line.
        raise MemoryError()
    word.lex = hashed
    return word
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
if attr == SIC:

View File

@ -4,6 +4,7 @@ from libc.stdint cimport uint64_t
from sparsehash.dense_hash_map cimport dense_hash_map
from _hashing cimport FixedTable
from _hashing cimport WordTree
# Circular import problems here
ctypedef size_t Lexeme_addr
@ -26,7 +27,7 @@ from spacy.lexeme cimport Orthography
cdef class Language:
cdef object name
cdef Vocab* vocab
cdef WordTree vocab
cdef Vocab* distri
cdef Vocab* ortho
cdef dict bacov
@ -37,7 +38,7 @@ cdef class Language:
cdef unicode unhash(self, StringHash hashed)
cpdef Tokens tokenize(self, unicode text)
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length) except NULL
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
int split, size_t length)
cdef Orthography* init_orth(self, StringHash hashed, unicode lex)

View File

@ -5,12 +5,12 @@ from libc.stdlib cimport calloc, free
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref
from murmurhash cimport mrmr
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD
from spacy.string_tools cimport substr
from _hashing cimport WordTree
from _hashing cimport to_utf8
from . import util
from os import path
@ -58,28 +58,27 @@ cdef class Language:
def __cinit__(self, name):
self.name = name
self.bacov = {}
self.vocab = new Vocab()
self.vocab = WordTree(0, 10)
self.ortho = new Vocab()
self.distri = new Vocab()
self.vocab[0].set_empty_key(0)
self.distri[0].set_empty_key(0)
self.ortho[0].set_empty_key(0)
self.vocab[0].set_deleted_key(1)
self.distri[0].set_deleted_key(1)
self.ortho[0].set_deleted_key(1)
self.load_tokenization(util.read_tokenization(name))
property words:
def __get__(self):
return self.bacov.keys()
def load_tokenization(self, token_rules=None):
cdef Lexeme* word
cdef StringHash hashed
for chunk, lex, tokens in token_rules:
hashed = self.hash_string(chunk, len(chunk))
word = self._add(hashed, lex, len(lex), len(lex))
word = self.init_lexeme(chunk)
for i, lex in enumerate(tokens):
token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
length = len(token_string)
hashed = self.hash_string(token_string, length)
word.tail = self._add(hashed, lex, 0, len(lex))
word.tail = self.init_lexeme(lex)
word = word.tail
def load_clusters(self):
@ -89,111 +88,59 @@ cdef class Language:
brown_loc = path.join(data_dir, 'clusters')
cdef size_t start
cdef int end
cdef unicode token_unicode
cdef bytes token_bytes
with util.utf8open(brown_loc) as browns_file:
for i, line in enumerate(browns_file):
cluster_str, token_string, freq_str = line.split()
cluster_str, token_unicode, freq_str = line.split()
token_bytes = token_unicode.encode('utf8')
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See redshift._parse_features.pyx
cluster = int(cluster_str[::-1], 2)
upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
hashed = self.hash_string(token_string, len(token_string))
word = self._add(hashed, token_string,
len(token_string), len(token_string))
cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
'''Hash unicode with MurmurHash64A'''
return mrmr.hash32(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
cdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
return self.bacov[hash_value].decode('utf8')
cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
'''Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, splitting off any attached punctuation or clitics. A
reference to BLANK_WORD is returned for the empty string.
To specify the boundaries of the word if it has not been seen, use lookup_chunk.
'''
if length == 0:
return <Lexeme_addr>&BLANK_WORD
cdef StringHash hashed = self.hash_string(string, length)
# First, check words seen 2+ times
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
if word_ptr == NULL:
start = self.find_split(string, length) if start == -1 else start
word_ptr = self._add(hashed, string, start, length)
return <Lexeme_addr>word_ptr
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
cdef size_t i
word = self.init_lexeme(string, hashed, split, length)
self.vocab[0][hashed] = <size_t>word
self.bacov[hashed] = string.encode('utf8')
return word
cpdef Tokens tokenize(self, unicode string):
cdef size_t length = len(string)
cdef Py_UNICODE* characters = <Py_UNICODE*>string
cdef size_t i
cdef Py_UNICODE c
word = self.init_lexeme(token_bytes)
cpdef Tokens tokenize(self, unicode unicode_string):
cdef bytes characters = unicode_string.encode('utf8')
cdef size_t length = len(characters)
cdef Tokens tokens = Tokens(self)
cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
cdef size_t word_len = 0
cdef size_t start = 0
cdef Lexeme* token
cdef size_t i
cdef unsigned char c
for i in range(length):
c = characters[i]
if _is_whitespace(c):
if word_len != 0:
token = <Lexeme*>self.lookup(-1, current, word_len)
if c == b' ':
if start < i:
token = <Lexeme*>self.lookup(characters[start:i])
while token != NULL:
tokens.append(<Lexeme_addr>token)
token = token.tail
for j in range(word_len+1):
current[j] = 0
word_len = 0
else:
current[word_len] = c
word_len += 1
if word_len != 0:
token = <Lexeme*>self.lookup(-1, current, word_len)
start = i + 1
if start < i:
token = <Lexeme*>self.lookup(characters[start:])
while token != NULL:
tokens.append(<Lexeme_addr>token)
token = token.tail
free(current)
return tokens
cdef int find_split(self, unicode word, size_t length):
return -1
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
int split, size_t length):
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
word.sic = hashed
cdef unicode tail_string
cdef unicode lex
if split != 0 and split < length:
lex = substr(string, 0, split, length)
tail_string = substr(string, split, length, length)
else:
lex = string
tail_string = ''
word.lex = self.hash_string(lex, len(lex))
self.bacov[word.lex] = lex.encode('utf8')
word.orth = <Orthography*>self.ortho[0][word.lex]
if word.orth == NULL:
word.orth = self.init_orth(word.lex, lex)
word.dist = <Distribution*>self.distri[0][word.lex]
# Now recurse, and deal with the tail
if tail_string:
word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
return word
cdef Lexeme_addr lookup(self, bytes string) except 0:
'''Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, splitting off any attached punctuation or clitics. A
reference to BLANK_WORD is returned for the empty string.
'''
cdef size_t length = len(string)
if length == 0:
return <Lexeme_addr>&BLANK_WORD
cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
if word_ptr == NULL:
start = self.find_split(string, length)
word_ptr = self.init_lexeme(string[)
self.vocab.set(string[start:], <size_t>word_ptr)
return <Lexeme_addr>word_ptr
cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
@ -219,6 +166,13 @@ cdef class Language:
self.ortho[0][hashed] = <size_t>orth
return orth
cdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.'''
return self.bacov[hash_value].decode('utf8')
cdef int find_split(self, unicode word, size_t length):
return -1
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
if c == ' ':

View File

@ -1,4 +1,10 @@
# cython: profile=True
from murmurhash cimport mrmr
# NOTE(review): a `self` parameter on what appears to be a module-level
# function looks like a leftover from a method version — confirm intent.
# sizeof(unsigned char) is 1 by definition, so the length expression is
# simply the byte count.
cdef StringHash hash_string(self, unsigned char* s, size_t length) except 0:
    '''Hash bytes with MurmurHash32'''
    return mrmr.hash32(s, length * sizeof(unsigned char), 0)
cpdef unicode substr(unicode string, int start, int end, size_t length):