mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* PointerHash working, efficiency is good. 6-7 mins
This commit is contained in:
parent
85d68e8e95
commit
0447279c57
1
setup.py
1
setup.py
|
@ -46,6 +46,7 @@ else:
|
||||||
|
|
||||||
exts = [
|
exts = [
|
||||||
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
|
||||||
|
Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
|
Extension("spacy.word", ["spacy/word.pyx"], language="c++",
|
||||||
include_dirs=includes),
|
include_dirs=includes),
|
||||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
|
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
|
||||||
|
|
|
@ -12,9 +12,9 @@ cdef struct Cell:
|
||||||
cdef class PointerHash:
|
cdef class PointerHash:
|
||||||
cdef size_t size
|
cdef size_t size
|
||||||
cdef size_t filled
|
cdef size_t filled
|
||||||
|
cdef Cell* _last
|
||||||
cdef Cell* cells
|
cdef Cell* cells
|
||||||
|
|
||||||
cdef size_t find_slot(self, key_t key)
|
cdef val_t lookup(self, key_t key)
|
||||||
cdef Cell* lookup(self, key_t key)
|
cdef void insert(self, key_t key, val_t value) except *
|
||||||
cdef void insert(self, key_t key, val_t value)
|
cdef void resize(self, size_t new_size) except *
|
||||||
cdef void resize(self, size_t new_size)
|
|
||||||
|
|
|
@ -6,7 +6,9 @@ cimport cython
|
||||||
cdef class PointerHash:
|
cdef class PointerHash:
|
||||||
def __cinit__(self, size_t initial_size=8):
|
def __cinit__(self, size_t initial_size=8):
|
||||||
self.size = initial_size
|
self.size = initial_size
|
||||||
|
self.size = 8
|
||||||
self.filled = 0
|
self.filled = 0
|
||||||
|
self._last = NULL
|
||||||
# Size must be power of two
|
# Size must be power of two
|
||||||
assert self.size & (self.size - 1) == 0
|
assert self.size & (self.size - 1) == 0
|
||||||
self.cells = <Cell*>calloc(self.size, sizeof(Cell))
|
self.cells = <Cell*>calloc(self.size, sizeof(Cell))
|
||||||
|
@ -16,42 +18,37 @@ cdef class PointerHash:
|
||||||
|
|
||||||
def __getitem__(self, key_t key):
|
def __getitem__(self, key_t key):
|
||||||
assert key != 0
|
assert key != 0
|
||||||
cdef Cell* cell = self.lookup(key)
|
cdef val_t value = self.lookup(key)
|
||||||
return cell.value if cell.key != 0 else None
|
return value if value != 0 else None
|
||||||
|
|
||||||
def __setitem__(self, key_t key, val_t value):
|
def __setitem__(self, key_t key, val_t value):
|
||||||
assert key != 0
|
assert key != 0
|
||||||
|
assert value != 0
|
||||||
self.insert(key, value)
|
self.insert(key, value)
|
||||||
|
|
||||||
@cython.cdivision
|
cdef val_t lookup(self, key_t key):
|
||||||
cdef size_t find_slot(self, key_t key):
|
cell = _find_cell(self.cells, self.size, key)
|
||||||
cdef size_t i = (key % self.size)
|
self._last = cell
|
||||||
while self.cells[i].key != 0 and self.cells[i].key != key:
|
return cell.value
|
||||||
i = (i + 1) % self.size
|
|
||||||
return i
|
|
||||||
|
|
||||||
@cython.cdivision
|
cdef void insert(self, key_t key, val_t value) except *:
|
||||||
cdef Cell* lookup(self, key_t key):
|
cdef Cell* cell
|
||||||
cdef size_t i = (key % self.size)
|
if self._last != NULL and key == self._last.key:
|
||||||
while self.cells[i].key != 0 and self.cells[i].key != key:
|
cell = self._last
|
||||||
i = (i + 1) % self.size
|
else:
|
||||||
return &self.cells[i]
|
cell = _find_cell(self.cells, self.size, key)
|
||||||
|
self._last = NULL
|
||||||
cdef void insert(self, key_t key, val_t value):
|
if cell.key == 0:
|
||||||
cdef size_t i = self.find_slot(key)
|
cell.key = key
|
||||||
if self.cells[i].key == 0:
|
|
||||||
self.cells[i].key = key
|
|
||||||
self.filled += 1
|
self.filled += 1
|
||||||
self.cells[i].value = value
|
cell.value = value
|
||||||
if (self.filled + 1) * 4 >= (self.size * 3):
|
if (self.filled + 1) * 4 >= (self.size * 3):
|
||||||
self.resize(self.size * 2)
|
self.resize(self.size * 2)
|
||||||
|
|
||||||
cdef void resize(self, size_t new_size):
|
cdef void resize(self, size_t new_size) except *:
|
||||||
assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
|
assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
|
||||||
assert self.filled * 4 <= new_size * 3
|
assert self.filled * 4 <= new_size * 3
|
||||||
|
|
||||||
self.size = new_size
|
|
||||||
|
|
||||||
cdef Cell* old_cells = self.cells
|
cdef Cell* old_cells = self.cells
|
||||||
cdef size_t old_size = self.size
|
cdef size_t old_size = self.size
|
||||||
|
|
||||||
|
@ -60,6 +57,17 @@ cdef class PointerHash:
|
||||||
|
|
||||||
self.filled = 0
|
self.filled = 0
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
|
cdef size_t slot
|
||||||
for i in range(old_size):
|
for i in range(old_size):
|
||||||
if self.cells[i].key != 0:
|
if old_cells[i].key != 0:
|
||||||
self.insert(self.cells[i].key, self.cells[i].value)
|
assert old_cells[i].value != 0, i
|
||||||
|
self.insert(old_cells[i].key, old_cells[i].value)
|
||||||
|
free(old_cells)
|
||||||
|
|
||||||
|
|
||||||
|
@cython.cdivision
|
||||||
|
cdef inline Cell* _find_cell(Cell* cells, size_t size, key_t key) nogil:
|
||||||
|
cdef size_t i = (key % size)
|
||||||
|
while cells[i].key != 0 and cells[i].key != key:
|
||||||
|
i = (i + 1) % size
|
||||||
|
return &cells[i]
|
||||||
|
|
|
@ -238,7 +238,7 @@ cdef class English(Language):
|
||||||
v_shape = View_WordShape
|
v_shape = View_WordShape
|
||||||
def __cinit__(self, name, user_string_features, user_flag_features):
|
def __cinit__(self, name, user_string_features, user_flag_features):
|
||||||
self.cache = PointerHash(2 ** 25)
|
self.cache = PointerHash(2 ** 25)
|
||||||
self.specials.set_empty_key(0)
|
self.specials = PointerHash(2 ** 16)
|
||||||
lang_data = util.read_lang_data(name)
|
lang_data = util.read_lang_data(name)
|
||||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||||
self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
|
self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
|
||||||
|
|
|
@ -15,49 +15,6 @@ cdef extern from "Python.h":
|
||||||
cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
|
cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
|
||||||
|
|
||||||
|
|
||||||
cdef extern from "sparsehash/dense_hash_map" namespace "google":
|
|
||||||
cdef cppclass dense_hash_map[K, D]:
|
|
||||||
K& key_type
|
|
||||||
D& data_type
|
|
||||||
pair[K, D]& value_type
|
|
||||||
uint64_t size_type
|
|
||||||
cppclass iterator:
|
|
||||||
pair[K, D]& operator*() nogil
|
|
||||||
iterator operator++() nogil
|
|
||||||
iterator operator--() nogil
|
|
||||||
bint operator==(iterator) nogil
|
|
||||||
bint operator!=(iterator) nogil
|
|
||||||
iterator begin()
|
|
||||||
iterator end()
|
|
||||||
uint64_t size()
|
|
||||||
uint64_t max_size()
|
|
||||||
bint empty()
|
|
||||||
uint64_t bucket_count()
|
|
||||||
uint64_t bucket_size(uint64_t i)
|
|
||||||
uint64_t bucket(K& key)
|
|
||||||
double max_load_factor()
|
|
||||||
void max_load_vactor(double new_grow)
|
|
||||||
double min_load_factor()
|
|
||||||
double min_load_factor(double new_grow)
|
|
||||||
void set_resizing_parameters(double shrink, double grow)
|
|
||||||
void resize(uint64_t n)
|
|
||||||
void rehash(uint64_t n)
|
|
||||||
dense_hash_map()
|
|
||||||
dense_hash_map(uint64_t n)
|
|
||||||
void swap(dense_hash_map&)
|
|
||||||
pair[iterator, bint] insert(pair[K, D]) nogil
|
|
||||||
void set_empty_key(K&)
|
|
||||||
void set_deleted_key(K& key)
|
|
||||||
void clear_deleted_key()
|
|
||||||
void erase(iterator pos)
|
|
||||||
uint64_t erase(K& k)
|
|
||||||
void erase(iterator first, iterator last)
|
|
||||||
void clear()
|
|
||||||
void clear_no_resize()
|
|
||||||
pair[iterator, iterator] equal_range(K& k)
|
|
||||||
D& operator[](K&) nogil
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct String:
|
cdef struct String:
|
||||||
Py_UNICODE* chars
|
Py_UNICODE* chars
|
||||||
size_t n
|
size_t n
|
||||||
|
@ -70,7 +27,7 @@ cdef class Lexicon:
|
||||||
cpdef Lexeme lookup(self, unicode string)
|
cpdef Lexeme lookup(self, unicode string)
|
||||||
cdef size_t get(self, String* s)
|
cdef size_t get(self, String* s)
|
||||||
|
|
||||||
cdef dense_hash_map[uint64_t, size_t] _dict
|
cdef PointerHash _dict
|
||||||
|
|
||||||
cdef list _string_features
|
cdef list _string_features
|
||||||
cdef list _flag_features
|
cdef list _flag_features
|
||||||
|
@ -79,7 +36,7 @@ cdef class Lexicon:
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef unicode name
|
cdef unicode name
|
||||||
cdef PointerHash cache
|
cdef PointerHash cache
|
||||||
cdef dense_hash_map[uint64_t, size_t] specials
|
cdef PointerHash specials
|
||||||
cpdef readonly Lexicon lexicon
|
cpdef readonly Lexicon lexicon
|
||||||
cpdef readonly object tokens_class
|
cpdef readonly object tokens_class
|
||||||
|
|
||||||
|
|
|
@ -43,7 +43,7 @@ cdef class Language:
|
||||||
string_features = []
|
string_features = []
|
||||||
self.name = name
|
self.name = name
|
||||||
self.cache = PointerHash(2 ** 22)
|
self.cache = PointerHash(2 ** 22)
|
||||||
self.specials.set_empty_key(0)
|
self.specials = PointerHash(2 ** 16)
|
||||||
lang_data = read_lang_data(name)
|
lang_data = read_lang_data(name)
|
||||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||||
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
|
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
|
||||||
|
@ -52,10 +52,7 @@ cdef class Language:
|
||||||
self.tokens_class = Tokens
|
self.tokens_class = Tokens
|
||||||
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
cdef uint64_t hashed
|
pass
|
||||||
cdef size_t lex_addr
|
|
||||||
for (hashed, lex_addr) in self.specials:
|
|
||||||
free(<LexemeC*>lex_addr)
|
|
||||||
|
|
||||||
property nr_types:
|
property nr_types:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -112,28 +109,25 @@ cdef class Language:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, String* string):
|
cdef int _tokenize(self, Tokens tokens, String* string):
|
||||||
cdef Cell* cell = self.cache.lookup(string.key)
|
cdef LexemeC** lexemes = <LexemeC**>self.cache.lookup(string.key)
|
||||||
cdef LexemeC** lexemes
|
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
if cell.key != 0:
|
if lexemes != NULL:
|
||||||
lexemes = <LexemeC**>cell.value
|
|
||||||
i = 0
|
i = 0
|
||||||
while lexemes[i] != NULL:
|
while lexemes[i] != NULL:
|
||||||
tokens.push_back(lexemes[i])
|
tokens.push_back(lexemes[i])
|
||||||
i += 1
|
i += 1
|
||||||
return 0
|
return 0
|
||||||
|
cdef uint64_t key = string.key
|
||||||
cell.key = string.key
|
|
||||||
self.cache.filled += 1
|
|
||||||
cdef size_t first_token = tokens.length
|
cdef size_t first_token = tokens.length
|
||||||
cdef int split
|
cdef int split
|
||||||
cdef int remaining = string.n
|
cdef int remaining = string.n
|
||||||
cdef String prefix
|
cdef String prefix
|
||||||
|
cdef Cell* tmp_cell
|
||||||
while remaining >= 1:
|
while remaining >= 1:
|
||||||
split = self._split_one(string.chars, string.n)
|
split = self._split_one(string.chars, string.n)
|
||||||
remaining -= split
|
remaining -= split
|
||||||
string_slice_prefix(string, &prefix, split)
|
string_slice_prefix(string, &prefix, split)
|
||||||
lexemes = <LexemeC**>self.specials[prefix.key]
|
lexemes = <LexemeC**>self.specials.lookup(prefix.key)
|
||||||
if lexemes != NULL:
|
if lexemes != NULL:
|
||||||
i = 0
|
i = 0
|
||||||
while lexemes[i] != NULL:
|
while lexemes[i] != NULL:
|
||||||
|
@ -145,7 +139,7 @@ cdef class Language:
|
||||||
cdef size_t j
|
cdef size_t j
|
||||||
for i, j in enumerate(range(first_token, tokens.length)):
|
for i, j in enumerate(range(first_token, tokens.length)):
|
||||||
lexemes[i] = tokens.lexemes[j]
|
lexemes[i] = tokens.lexemes[j]
|
||||||
cell.value = <size_t>lexemes
|
self.cache.insert(key, <size_t>lexemes)
|
||||||
|
|
||||||
cdef int _split_one(self, Py_UNICODE* characters, size_t length):
|
cdef int _split_one(self, Py_UNICODE* characters, size_t length):
|
||||||
return length
|
return length
|
||||||
|
@ -181,7 +175,7 @@ cdef class Lexicon:
|
||||||
string_features, flag_features):
|
string_features, flag_features):
|
||||||
self._flag_features = flag_features
|
self._flag_features = flag_features
|
||||||
self._string_features = string_features
|
self._string_features = string_features
|
||||||
self._dict.set_empty_key(0)
|
self._dict = PointerHash(2 ** 20)
|
||||||
self.size = 0
|
self.size = 0
|
||||||
cdef Lexeme word
|
cdef Lexeme word
|
||||||
for string in words:
|
for string in words:
|
||||||
|
@ -200,9 +194,9 @@ cdef class Lexicon:
|
||||||
self.size += 1
|
self.size += 1
|
||||||
|
|
||||||
cdef size_t get(self, String* string):
|
cdef size_t get(self, String* string):
|
||||||
cdef LexemeC* lexeme = <LexemeC*>self._dict[string.key]
|
cdef size_t lex_addr = self._dict.lookup(string.key)
|
||||||
if lexeme != NULL:
|
if lex_addr != 0:
|
||||||
return <size_t>lexeme
|
return lex_addr
|
||||||
|
|
||||||
cdef unicode uni_string = string.chars[:string.n]
|
cdef unicode uni_string = string.chars[:string.n]
|
||||||
views = [string_view(uni_string, 0.0, 0, {}, {})
|
views = [string_view(uni_string, 0.0, 0, {}, {})
|
||||||
|
@ -212,8 +206,8 @@ cdef class Lexicon:
|
||||||
if flag_feature(uni_string, 0.0, {}, {}):
|
if flag_feature(uni_string, 0.0, {}, {}):
|
||||||
flags.add(i)
|
flags.add(i)
|
||||||
|
|
||||||
lexeme = lexeme_init(uni_string, 0, 0, views, flags)
|
cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags)
|
||||||
self._dict[string.key] = <size_t>lexeme
|
self._dict.insert(string.key, <size_t>lexeme)
|
||||||
self.size += 1
|
self.size += 1
|
||||||
return <size_t>lexeme
|
return <size_t>lexeme
|
||||||
|
|
||||||
|
|
20
tests/test_hashing.py
Normal file
20
tests/test_hashing.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from spacy._hashing import PointerHash
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert():
|
||||||
|
h = PointerHash()
|
||||||
|
assert h[1] is None
|
||||||
|
h[1] = 5
|
||||||
|
assert h[1] == 5
|
||||||
|
h[2] = 6
|
||||||
|
assert h[1] == 5
|
||||||
|
assert h[2] == 6
|
||||||
|
|
||||||
|
def test_resize():
|
||||||
|
h = PointerHash(4)
|
||||||
|
for i in range(1, 100):
|
||||||
|
value = int(i * (random.random() + 1))
|
||||||
|
h[i] = value
|
Loading…
Reference in New Issue
Block a user