Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)
commit 865cacfaf7 (parent 515d41d325)

    * Remove dependence on murmurhash

setup.py | 4
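
The substance of the change: string hashing that previously went through the
Cython helper hash_string(), a thin wrapper over mrmr.hash32 from the bundled
murmurhash sources, now uses Python's builtin hash(). One behavioral
trade-off is worth noting; a minimal plain-Python sketch, assuming a modern
CPython (hash randomization was not yet the default in the Python 2 era this
code targets):

    # Builtin hash() is deterministic within a single interpreter process:
    s = u"the"
    assert hash(s) == hash(s)

    # But with string hash randomization enabled, hash(u"the") changes from
    # run to run unless PYTHONHASHSEED is fixed, whereas MurmurHash produces
    # the same value across processes and platforms for the same input.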
setup.py:

@@ -44,8 +44,6 @@ else:
     # If you're not using virtualenv, set your include dir here.
     pass
 
-print includes
-print cython_includes
 
 exts = [
     Extension("spacy.en", ["spacy/en.pyx"], language="c++",
@@ -64,8 +62,6 @@ exts = [
               cython_include_dirs=cython_includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
               include_dirs=includes, cython_include_dirs=cython_includes),
-    Extension("murmurhash.mrmr", ["murmurhash/mrmr.pyx", 'murmurhash/MurmurHash2.cpp', 'murmurhash/MurmurHash3.cpp'], language="c++",
-              include_dirs=includes, cython_include_dirs=cython_includes)
 ]
 
 
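After this hunk, ext_modules no longer compiles the bundled MurmurHash C++
sources. A self-contained sketch of the trimmed list as the surrounding diff
context suggests it; the empty includes and cython_includes lists are
placeholders for the variables defined earlier in setup.py:

    from setuptools import Extension

    includes = []          # placeholder: set earlier in setup.py
    cython_includes = []   # placeholder: set earlier in setup.py

    exts = [
        Extension("spacy.en", ["spacy/en.pyx"], language="c++",
                  include_dirs=includes, cython_include_dirs=cython_includes),
        Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
                  include_dirs=includes, cython_include_dirs=cython_includes),
    ]

The remaining hunks update the Cython sources themselves.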
@@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
 
-from murmurhash cimport mrmr
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD
 
@@ -16,11 +15,6 @@ from os import path
 cimport cython
 
 
-cdef inline StringHash hash_string(Py_UNICODE* string, size_t length) nogil:
-    '''Hash unicode with MurmurHash64A'''
-    return mrmr.hash32(<Py_UNICODE*>string, length * sizeof(Py_UNICODE), 0)
-
-
 def get_normalized(unicode lex, size_t length):
     if lex.isalpha() and lex.islower():
         return lex
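The removed helper was the Cython-level entry point into the mrmr API; note
that the docstring says MurmurHash64A while the call is to the 32-bit hash32.
For reference, a rough pure-Python equivalent; the third-party mmh3 binding
here is an illustrative stand-in, not part of this codebase, and it hashes
the UTF-8 bytes rather than the raw Py_UNICODE buffer, so the exact values
differ from the Cython original:

    import mmh3  # third-party MurmurHash3 binding, assumed for illustration

    def hash_string(string, seed=0):
        # 32-bit MurmurHash of the string's UTF-8 bytes (approximation of
        # the removed helper, which hashed the Py_UNICODE buffer directly).
        return mmh3.hash(string.encode("utf8"), seed)

    print(hash_string(u"the"))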
@@ -97,7 +91,7 @@ cdef class Language:
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
 
-        cdef StringHash hashed = hash_string(string, len(string))
+        cdef StringHash hashed = hash(string)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         if word_ptr == NULL:
@@ -112,7 +106,7 @@ cdef class Language:
         cdef size_t length = len(string)
         if length == 0:
             return <Lexeme_addr>&BLANK_WORD
-        cdef StringHash hashed = hash_string(string, length)
+        cdef StringHash hashed = hash(string)
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         cdef int split
@@ -141,7 +135,7 @@ cdef class Language:
     cdef Lexeme* new_lexeme(self, StringHash key, unicode string) except NULL:
         cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
         word.sic = key
-        word.lex = hash_string(string, len(string))
+        word.lex = hash(string)
         self.bacov[word.lex] = string
         word.orth = self.lookup_orth(word.lex, string)
         word.dist = self.lookup_dist(word.lex)
@@ -162,11 +156,11 @@ cdef class Language:
         orth.flags = set_orth_flags(lex, orth.length)
         orth.norm = hashed
         last3 = substr(lex, length - 3, length, length)
-        orth.last3 = hash_string(last3, len(last3))
+        orth.last3 = hash(last3)
         norm = get_normalized(lex, length)
-        orth.norm = hash_string(norm, len(norm))
+        orth.norm = hash(norm)
         shape = get_word_shape(lex, length)
-        orth.shape = hash_string(shape, len(shape))
+        orth.shape = hash(shape)
 
         self.bacov[orth.last3] = last3
         self.bacov[orth.norm] = norm
@@ -191,12 +185,12 @@ cdef class Language:
         cdef Lexeme* word
         cdef StringHash hashed
         for chunk, lex, tokens in token_rules:
-            hashed = hash_string(chunk, len(chunk))
+            hashed = hash(chunk)
             word = <Lexeme*>self.new_lexeme(hashed, lex)
             for i, lex in enumerate(tokens):
                 token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
                 length = len(token_string)
-                hashed = hash_string(token_string, len(token_string))
+                hashed = hash(token_string)
                 word.tail = <Lexeme*>self.new_lexeme(hashed, lex)
                 word = word.tail
 
@@ -214,7 +208,7 @@ cdef class Language:
             # the first 4 bits. See redshift._parse_features.pyx
             cluster = int(cluster_str[::-1], 2)
             upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
-            hashed = hash_string(token_string, len(token_string))
+            hashed = hash(token_string)
             word = self.init_lexeme(hashed, token_string)
 
 
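A pattern that runs through the hunks above: self.vocab maps a string hash to
a Lexeme pointer, while self.bacov keeps the reverse hash-to-unicode mapping
so a hash can be turned back into text. A plain-Python sketch of that two-way
interning idea; the dicts stand in for the C++ containers and the record is a
minimal placeholder for the Lexeme struct:

    vocab = {}   # StringHash -> lexeme record
    bacov = {}   # StringHash -> original unicode string

    def lookup(string):
        hashed = hash(string)
        if hashed not in vocab:
            vocab[hashed] = {"sic": hashed}   # stand-in for a Lexeme
            bacov[hashed] = string
        return hashed

    key = lookup(u"the")
    assert bacov[key] == u"the"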
@@ -1,6 +1,4 @@
 # cython: profile=True
-from murmurhash cimport mrmr
-
 
 cpdef bytes to_bytes(unicode string):
     return string.encode('utf8')
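The surviving to_bytes helper is a plain UTF-8 encode; its behavior in
ordinary Python, for reference:

    def to_bytes(string):
        # Same behavior as the cpdef above: encode unicode text as UTF-8.
        return string.encode("utf8")

    assert to_bytes(u"caf\u00e9") == b"caf\xc3\xa9"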