* Remove dependence on murmurhash

commit 865cacfaf7
parent 515d41d325
Author: Matthew Honnibal
Date:   2014-08-16 17:37:09 +02:00

3 changed files with 9 additions and 21 deletions

View File

@@ -44,8 +44,6 @@ else:
     # If you're not using virtualenv, set your include dir here.
     pass
 
-print includes
-print cython_includes
 
 exts = [
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
@ -64,8 +62,6 @@ exts = [
cython_include_dirs=cython_includes), cython_include_dirs=cython_includes),
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
include_dirs=includes, cython_include_dirs=cython_includes), include_dirs=includes, cython_include_dirs=cython_includes),
Extension("murmurhash.mrmr", ["murmurhash/mrmr.pyx", 'murmurhash/MurmurHash2.cpp', 'murmurhash/MurmurHash3.cpp'], language="c++",
include_dirs=includes, cython_include_dirs=cython_includes)
] ]

View File

@@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
-from murmurhash cimport mrmr
 
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
@@ -16,11 +15,6 @@ from os import path
 cimport cython
 
-cdef inline StringHash hash_string(Py_UNICODE* string, size_t length) nogil:
-    '''Hash unicode with MurmurHash64A'''
-    return mrmr.hash32(<Py_UNICODE*>string, length * sizeof(Py_UNICODE), 0)
-
-
 
 def get_normalized(unicode lex, size_t length):
     if lex.isalpha() and lex.islower():
         return lex
@@ -97,7 +91,7 @@ cdef class Language:
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash_string(string, len(string))
+        cdef StringHash hashed = hash(string)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         if word_ptr == NULL:
@@ -112,7 +106,7 @@ cdef class Language:
         cdef size_t length = len(string)
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash_string(string, length)
+        cdef StringHash hashed = hash(string)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         cdef int split
@@ -141,7 +135,7 @@ cdef class Language:
     cdef Lexeme* new_lexeme(self, StringHash key, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
         word.sic = key
-        word.lex = hash_string(string, len(string))
+        word.lex = hash(string)
         self.bacov[word.lex] = string
         word.orth = self.lookup_orth(word.lex, string)
         word.dist = self.lookup_dist(word.lex)
@@ -162,11 +156,11 @@ cdef class Language:
         orth.flags = set_orth_flags(lex, orth.length)
         orth.norm = hashed
         last3 = substr(lex, length - 3, length, length)
-        orth.last3 = hash_string(last3, len(last3))
+        orth.last3 = hash(last3)
         norm = get_normalized(lex, length)
-        orth.norm = hash_string(norm, len(norm))
+        orth.norm = hash(norm)
         shape = get_word_shape(lex, length)
-        orth.shape = hash_string(shape, len(shape))
+        orth.shape = hash(shape)
 
         self.bacov[orth.last3] = last3
         self.bacov[orth.norm] = norm
@@ -191,12 +185,12 @@ cdef class Language:
         cdef Lexeme* word
         cdef StringHash hashed
         for chunk, lex, tokens in token_rules:
-            hashed = hash_string(chunk, len(chunk))
+            hashed = hash(chunk)
             word = <Lexeme*>self.new_lexeme(hashed, lex)
             for i, lex in enumerate(tokens):
                 token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
                 length = len(token_string)
-                hashed = hash_string(token_string, len(token_string))
+                hashed = hash(token_string)
                 word.tail = <Lexeme*>self.new_lexeme(hashed, lex)
                 word = word.tail
@@ -214,7 +208,7 @@ cdef class Language:
             # the first 4 bits. See redshift._parse_features.pyx
            cluster = int(cluster_str[::-1], 2)
            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-            hashed = hash_string(token_string, len(token_string))
+            hashed = hash(token_string)
            word = self.init_lexeme(hashed, token_string)

View File

@@ -1,6 +1,4 @@
 # cython: profile=True
-from murmurhash cimport mrmr
-
 
 cpdef bytes to_bytes(unicode string):
     return string.encode('utf8')
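
The net effect of the diff: everywhere the vocabulary previously keyed strings through the MurmurHash wrapper (mrmr.hash32 over the raw Py_UNICODE buffer), it now uses Python's builtin hash(). A minimal plain-Python sketch of that tradeoff, with illustrative names that are not from the repo:

    # Illustrative sketch only -- not code from this commit.
    import zlib

    def murmur_style_key(string):
        # zlib.crc32 stands in for mrmr.hash32 purely for illustration:
        # both are deterministic, mapping the same bytes to the same
        # integer in every process (the property MurmurHash provided).
        return zlib.crc32(string.encode('utf8')) & 0xffffffff

    # After the change: the interpreter's builtin hash(). No compiled
    # dependency and no byte-length bookkeeping, but depending on the
    # interpreter version and PYTHONHASHSEED its value can differ
    # between processes, so keys are only safe within a single run.
    vocab = {}
    key = hash(u'hello')
    vocab[key] = u'hello'   # same pattern as self.bacov[word.lex] = string

Dropping the extension also simplifies the build: setup.py no longer has to compile murmurhash/MurmurHash2.cpp and murmurhash/MurmurHash3.cpp.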