mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Replaced cache with own hash table. Similar timing
This commit is contained in:
parent
c8db76e3e1
commit
85d68e8e95
|
@ -1,5 +1,7 @@
|
||||||
ctypedef key_t size_t
|
from libc.stdint cimport uint64_t
|
||||||
ctypedef val_t size_t
|
|
||||||
|
ctypedef uint64_t key_t
|
||||||
|
ctypedef size_t val_t
|
||||||
|
|
||||||
|
|
||||||
cdef struct Cell:
|
cdef struct Cell:
|
||||||
|
@ -14,5 +16,5 @@ cdef class PointerHash:
|
||||||
|
|
||||||
cdef size_t find_slot(self, key_t key)
|
cdef size_t find_slot(self, key_t key)
|
||||||
cdef Cell* lookup(self, key_t key)
|
cdef Cell* lookup(self, key_t key)
|
||||||
cdef void insert(self, key_t key)
|
cdef void insert(self, key_t key, val_t value)
|
||||||
cdef void resize(self, size_t new_size)
|
cdef void resize(self, size_t new_size)
|
||||||
|
|
|
@ -1,3 +1,8 @@
|
||||||
|
# cython: profile=True
|
||||||
|
from libc.stdlib cimport calloc, free
|
||||||
|
cimport cython
|
||||||
|
|
||||||
|
|
||||||
cdef class PointerHash:
|
cdef class PointerHash:
|
||||||
def __cinit__(self, size_t initial_size=8):
|
def __cinit__(self, size_t initial_size=8):
|
||||||
self.size = initial_size
|
self.size = initial_size
|
||||||
|
@ -10,20 +15,26 @@ cdef class PointerHash:
|
||||||
free(self.cells)
|
free(self.cells)
|
||||||
|
|
||||||
def __getitem__(self, key_t key):
|
def __getitem__(self, key_t key):
|
||||||
|
assert key != 0
|
||||||
cdef Cell* cell = self.lookup(key)
|
cdef Cell* cell = self.lookup(key)
|
||||||
return cell.value if cell.key != 0 else None
|
return cell.value if cell.key != 0 else None
|
||||||
|
|
||||||
def __setitem__(self, key_t key, val_t value):
|
def __setitem__(self, key_t key, val_t value):
|
||||||
self.insert(key, value
|
assert key != 0
|
||||||
|
self.insert(key, value)
|
||||||
|
|
||||||
|
@cython.cdivision
|
||||||
cdef size_t find_slot(self, key_t key):
|
cdef size_t find_slot(self, key_t key):
|
||||||
cdef size_t i = key % self.size
|
cdef size_t i = (key % self.size)
|
||||||
while self.cells[i].key != 0 and self.cells[i].key != key:
|
while self.cells[i].key != 0 and self.cells[i].key != key:
|
||||||
i = (i + 1) % self.size
|
i = (i + 1) % self.size
|
||||||
return i
|
return i
|
||||||
|
|
||||||
|
@cython.cdivision
|
||||||
cdef Cell* lookup(self, key_t key):
|
cdef Cell* lookup(self, key_t key):
|
||||||
cdef size_t i = self.find_slot(key)
|
cdef size_t i = (key % self.size)
|
||||||
|
while self.cells[i].key != 0 and self.cells[i].key != key:
|
||||||
|
i = (i + 1) % self.size
|
||||||
return &self.cells[i]
|
return &self.cells[i]
|
||||||
|
|
||||||
cdef void insert(self, key_t key, val_t value):
|
cdef void insert(self, key_t key, val_t value):
|
||||||
|
@ -36,7 +47,7 @@ cdef class PointerHash:
|
||||||
self.resize(self.size * 2)
|
self.resize(self.size * 2)
|
||||||
|
|
||||||
cdef void resize(self, size_t new_size):
|
cdef void resize(self, size_t new_size):
|
||||||
assert new_size & (new_size - 1)) == 0 # Must be a power of 2
|
assert (new_size & (new_size - 1)) == 0 # Must be a power of 2
|
||||||
assert self.filled * 4 <= new_size * 3
|
assert self.filled * 4 <= new_size * 3
|
||||||
|
|
||||||
self.size = new_size
|
self.size = new_size
|
||||||
|
@ -47,5 +58,8 @@ cdef class PointerHash:
|
||||||
self.size = new_size
|
self.size = new_size
|
||||||
self.cells = <Cell*>calloc(new_size, sizeof(Cell))
|
self.cells = <Cell*>calloc(new_size, sizeof(Cell))
|
||||||
|
|
||||||
|
self.filled = 0
|
||||||
|
cdef size_t i
|
||||||
for i in range(old_size):
|
for i in range(old_size):
|
||||||
|
if self.cells[i].key != 0:
|
||||||
self.insert(self.cells[i].key, self.cells[i].value)
|
self.insert(self.cells[i].key, self.cells[i].value)
|
||||||
|
|
|
@ -43,6 +43,7 @@ from libc.stdint cimport uint64_t
|
||||||
cimport lang
|
cimport lang
|
||||||
from spacy.lexeme cimport lexeme_check_flag
|
from spacy.lexeme cimport lexeme_check_flag
|
||||||
from spacy.lexeme cimport lexeme_string_view
|
from spacy.lexeme cimport lexeme_string_view
|
||||||
|
from spacy._hashing cimport PointerHash
|
||||||
|
|
||||||
from spacy import util
|
from spacy import util
|
||||||
|
|
||||||
|
@ -236,7 +237,7 @@ cdef class English(Language):
|
||||||
fl_is_digit = Flag_IsDigit
|
fl_is_digit = Flag_IsDigit
|
||||||
v_shape = View_WordShape
|
v_shape = View_WordShape
|
||||||
def __cinit__(self, name, user_string_features, user_flag_features):
|
def __cinit__(self, name, user_string_features, user_flag_features):
|
||||||
self.cache.set_empty_key(0)
|
self.cache = PointerHash(2 ** 25)
|
||||||
self.specials.set_empty_key(0)
|
self.specials.set_empty_key(0)
|
||||||
lang_data = util.read_lang_data(name)
|
lang_data = util.read_lang_data(name)
|
||||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||||
|
|
|
@ -3,6 +3,7 @@ from libc.stdint cimport uint64_t
|
||||||
from spacy.word cimport Lexeme
|
from spacy.word cimport Lexeme
|
||||||
from spacy.tokens cimport Tokens
|
from spacy.tokens cimport Tokens
|
||||||
from spacy.lexeme cimport LexemeC
|
from spacy.lexeme cimport LexemeC
|
||||||
|
from spacy._hashing cimport PointerHash
|
||||||
|
|
||||||
from libcpp.utility cimport pair
|
from libcpp.utility cimport pair
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
@ -77,7 +78,7 @@ cdef class Lexicon:
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef unicode name
|
cdef unicode name
|
||||||
cdef dense_hash_map[uint64_t, size_t] cache
|
cdef PointerHash cache
|
||||||
cdef dense_hash_map[uint64_t, size_t] specials
|
cdef dense_hash_map[uint64_t, size_t] specials
|
||||||
cpdef readonly Lexicon lexicon
|
cpdef readonly Lexicon lexicon
|
||||||
cpdef readonly object tokens_class
|
cpdef readonly object tokens_class
|
||||||
|
|
|
@ -19,6 +19,8 @@ from spacy.tokens import Tokens
|
||||||
from spacy.lexeme cimport LexemeC, lexeme_init
|
from spacy.lexeme cimport LexemeC, lexeme_init
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
|
from spacy._hashing cimport PointerHash
|
||||||
|
from spacy._hashing cimport Cell
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
"""Base class for language-specific tokenizers.
|
"""Base class for language-specific tokenizers.
|
||||||
|
@ -40,7 +42,7 @@ cdef class Language:
|
||||||
if string_features is None:
|
if string_features is None:
|
||||||
string_features = []
|
string_features = []
|
||||||
self.name = name
|
self.name = name
|
||||||
self.cache.set_empty_key(0)
|
self.cache = PointerHash(2 ** 22)
|
||||||
self.specials.set_empty_key(0)
|
self.specials.set_empty_key(0)
|
||||||
lang_data = read_lang_data(name)
|
lang_data = read_lang_data(name)
|
||||||
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
rules, words, probs, clusters, case_stats, tag_stats = lang_data
|
||||||
|
@ -110,17 +112,19 @@ cdef class Language:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, String* string):
|
cdef int _tokenize(self, Tokens tokens, String* string):
|
||||||
cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key]
|
cdef Cell* cell = self.cache.lookup(string.key)
|
||||||
lexemes = <LexemeC**>self.cache[string.key]
|
cdef LexemeC** lexemes
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
if lexemes != NULL:
|
if cell.key != 0:
|
||||||
|
lexemes = <LexemeC**>cell.value
|
||||||
i = 0
|
i = 0
|
||||||
while lexemes[i] != NULL:
|
while lexemes[i] != NULL:
|
||||||
tokens.push_back(lexemes[i])
|
tokens.push_back(lexemes[i])
|
||||||
i += 1
|
i += 1
|
||||||
return 0
|
return 0
|
||||||
cdef uint64_t hashed = string.key
|
|
||||||
|
|
||||||
|
cell.key = string.key
|
||||||
|
self.cache.filled += 1
|
||||||
cdef size_t first_token = tokens.length
|
cdef size_t first_token = tokens.length
|
||||||
cdef int split
|
cdef int split
|
||||||
cdef int remaining = string.n
|
cdef int remaining = string.n
|
||||||
|
@ -141,7 +145,7 @@ cdef class Language:
|
||||||
cdef size_t j
|
cdef size_t j
|
||||||
for i, j in enumerate(range(first_token, tokens.length)):
|
for i, j in enumerate(range(first_token, tokens.length)):
|
||||||
lexemes[i] = tokens.lexemes[j]
|
lexemes[i] = tokens.lexemes[j]
|
||||||
self.cache[hashed] = <size_t>lexemes
|
cell.value = <size_t>lexemes
|
||||||
|
|
||||||
cdef int _split_one(self, Py_UNICODE* characters, size_t length):
|
cdef int _split_one(self, Py_UNICODE* characters, size_t length):
|
||||||
return length
|
return length
|
||||||
|
@ -169,7 +173,7 @@ cdef class Language:
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
string_from_unicode(&string, uni_string)
|
string_from_unicode(&string, uni_string)
|
||||||
self.specials[string.key] = <size_t>lexemes
|
self.specials[string.key] = <size_t>lexemes
|
||||||
self.cache[string.key] = <size_t>lexemes
|
self.cache.insert(string.key, <size_t>lexemes)
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexicon:
|
cdef class Lexicon:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user