* Switch to using the sparsehash and murmurhash libraries installed from pip

This commit is contained in:
Matthew Honnibal 2014-07-25 15:47:27 +01:00
parent 3f7cbb93e0
commit b9016c4633
4 changed files with 22 additions and 30 deletions

View File

@ -1,3 +1,6 @@
cython cython
sparsehash
murmurhash
fabric fabric
pytest pytest

View File

@ -7,6 +7,7 @@ import sys
import os import os
import os.path import os.path
from os import path from os import path
from glob import glob
def clean(ext): def clean(ext):
@ -34,32 +35,22 @@ libs = []
includes = [] includes = []
exts = [
Extension("ext.sparsehash", ["ext/sparsehash.pyx"], language="c++"),
Extension('ext.murmurhash',
["ext/murmurhash.pyx", "ext/MurmurHash2.cpp",
"ext/MurmurHash3.cpp"], language="c++",
include_dirs=[path.join(HERE, 'ext')]),
Extension("spacy.en", if 'VIRTUAL_ENV' in os.environ:
["spacy/en.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"], includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
language="c++", else:
include_dirs=[path.join(HERE, 'ext')]), # If you're not using virtualenv, set your include dir here.
Extension("spacy.en_ptb", pass
["spacy/en_ptb.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++",
include_dirs=[path.join(HERE, 'ext')]), exts = [
Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
Extension("spacy.spacy", Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
["spacy/spacy.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"], Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
language="c++", include_dirs=includes), Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
Extension("spacy.tokens", include_dirs=includes),
["spacy/tokens.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", include_dirs=includes),
Extension("spacy.string_tools",
["spacy/string_tools.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", include_dirs=includes),
] ]
@ -68,7 +59,7 @@ if sys.argv[1] == 'clean':
map(clean, exts) map(clean, exts)
distutils.core.setup( distutils.core.setup(
name='Sparse linear models with Cython', name='Lightning fast, full-cream NL tokenizer',
packages=['thinc'], packages=['thinc'],
author='Matthew Honnibal', author='Matthew Honnibal',
author_email='honnibal@gmail.com', author_email='honnibal@gmail.com',

View File

@ -1,7 +1,7 @@
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from ext.sparsehash cimport dense_hash_map from sparsehash.dense_hash_map cimport dense_hash_map
# Circular import problems here # Circular import problems here

View File

@ -3,9 +3,7 @@ from __future__ import unicode_literals
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
from ext.murmurhash cimport MurmurHash64A from murmurhash cimport mrmr
from ext.murmurhash cimport MurmurHash64B
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD from spacy.lexeme cimport BLANK_WORD
@ -100,7 +98,7 @@ cdef class Language:
cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0: cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
'''Hash unicode with MurmurHash64A''' '''Hash unicode with MurmurHash64A'''
return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0) return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
cdef unicode unhash(self, StringHash hash_value): cdef unicode unhash(self, StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.''' '''Fetch a string from the reverse index, given its hash value.'''