* Remove dependence on murmurhash

2025-11-24 03:46:02 +03:00 · 2014-08-16 17:37:09 +02:00 · 2014-08-16 17:37:09 +02:00 · 865cacfaf7
commit 865cacfaf7
parent 515d41d325
3 changed files with 9 additions and 21 deletions
--- a/setup.py
+++ b/setup.py
@@ -44,8 +44,6 @@ else:
    # If you're not using virtualenv, set your include dir here.
    pass
 print includes
 print cython_includes
 exts = [
    Extension("spacy.en", ["spacy/en.pyx"], language="c++",
@@ -64,8 +62,6 @@ exts = [
              cython_include_dirs=cython_includes),
    Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
              include_dirs=includes, cython_include_dirs=cython_includes),
-    Extension("murmurhash.mrmr", ["murmurhash/mrmr.pyx", 'murmurhash/MurmurHash2.cpp', 'murmurhash/MurmurHash3.cpp'], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes)
 ]
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
-from murmurhash cimport mrmr
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
@@ -16,11 +15,6 @@ from os import path
 cimport cython
-cdef inline StringHash hash_string(Py_UNICODE* string, size_t length) nogil:
-    '''Hash unicode with MurmurHash64A'''
-    return mrmr.hash32(<Py_UNICODE*>string, length * sizeof(Py_UNICODE), 0)
 def get_normalized(unicode lex, size_t length):
    if lex.isalpha() and lex.islower():
        return lex
@@ -97,7 +91,7 @@ cdef class Language:
        if length == 0:
            return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash_string(string, len(string))
+        cdef StringHash hashed = hash(string)
        # First, check words seen 2+ times
        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
        if word_ptr == NULL:
@@ -112,7 +106,7 @@ cdef class Language:
        cdef size_t length = len(string)
        if length == 0:
            return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash_string(string, length)
+        cdef StringHash hashed = hash(string)
        # First, check words seen 2+ times
        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
        cdef int split
@@ -141,7 +135,7 @@ cdef class Language:
    cdef Lexeme* new_lexeme(self, StringHash key, unicode string) except NULL:
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
        word.sic = key
-        word.lex = hash_string(string, len(string))
+        word.lex = hash(string)
        self.bacov[word.lex] = string
        word.orth = self.lookup_orth(word.lex, string)
        word.dist = self.lookup_dist(word.lex)
@@ -162,11 +156,11 @@ cdef class Language:
        orth.flags = set_orth_flags(lex, orth.length)
        orth.norm = hashed
        last3 = substr(lex, length - 3, length, length)
-        orth.last3 = hash_string(last3, len(last3))
+        orth.last3 = hash(last3)
        norm = get_normalized(lex, length)
-        orth.norm = hash_string(norm, len(norm))
+        orth.norm = hash(norm)
        shape = get_word_shape(lex, length)
-        orth.shape = hash_string(shape, len(shape))
+        orth.shape = hash(shape)
        self.bacov[orth.last3] = last3
        self.bacov[orth.norm] = norm
@@ -191,12 +185,12 @@ cdef class Language:
        cdef Lexeme* word
        cdef StringHash hashed
        for chunk, lex, tokens in token_rules:
-            hashed = hash_string(chunk, len(chunk))
+            hashed = hash(chunk)
            word = <Lexeme*>self.new_lexeme(hashed, lex)
            for i, lex in enumerate(tokens):
                token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
                length = len(token_string)
-                hashed = hash_string(token_string, len(token_string))
+                hashed = hash(token_string)
                word.tail = <Lexeme*>self.new_lexeme(hashed, lex)
                word = word.tail
@@ -214,7 +208,7 @@ cdef class Language:
                # the first 4 bits. See redshift._parse_features.pyx
                cluster = int(cluster_str[::-1], 2)
                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-                hashed = hash_string(token_string, len(token_string))
+                hashed = hash(token_string)
                word = self.init_lexeme(hashed, token_string)
--- a/spacy/string_tools.pyx
+++ b/spacy/string_tools.pyx
@@ -1,6 +1,4 @@
 # cython: profile=True
-from murmurhash cimport mrmr
 cpdef bytes to_bytes(unicode string):
    return string.encode('utf8')