mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Replace the use of dense_hash_map with Python dict
This commit is contained in:
parent
6f83dca218
commit
47fbd0475a
|
@ -2,12 +2,9 @@ from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from sparsehash.dense_hash_map cimport dense_hash_map
|
|
||||||
|
|
||||||
# Circular import problems here
|
# Circular import problems here
|
||||||
ctypedef size_t Lexeme_addr
|
ctypedef size_t Lexeme_addr
|
||||||
ctypedef uint32_t StringHash
|
ctypedef uint32_t StringHash
|
||||||
ctypedef dense_hash_map[StringHash, size_t] Vocab
|
|
||||||
from spacy.lexeme cimport Lexeme
|
from spacy.lexeme cimport Lexeme
|
||||||
|
|
||||||
from spacy.tokens cimport Tokens
|
from spacy.tokens cimport Tokens
|
||||||
|
@ -23,8 +20,8 @@ from spacy.lexeme cimport Lexeme
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef object name
|
cdef object name
|
||||||
cdef dense_hash_map[StringHash, size_t] chunks
|
cdef dict chunks
|
||||||
cdef dense_hash_map[StringHash, size_t] vocab
|
cdef dict vocab
|
||||||
cdef dict bacov
|
cdef dict bacov
|
||||||
|
|
||||||
cpdef Tokens tokenize(self, unicode text)
|
cpdef Tokens tokenize(self, unicode text)
|
||||||
|
|
|
@ -28,10 +28,8 @@ cdef class Language:
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.bacov = {}
|
self.bacov = {}
|
||||||
self.chunks = dense_hash_map[StringHash, size_t]()
|
self.chunks = {}
|
||||||
self.vocab = dense_hash_map[StringHash, size_t]()
|
self.vocab = {}
|
||||||
self.chunks.set_empty_key(0)
|
|
||||||
self.vocab.set_empty_key(0)
|
|
||||||
self.load_tokenization(util.read_tokenization(name))
|
self.load_tokenization(util.read_tokenization(name))
|
||||||
self.load_dist_info(util.read_dist_info(name))
|
self.load_dist_info(util.read_dist_info(name))
|
||||||
|
|
||||||
|
@ -65,16 +63,24 @@ cdef class Language:
|
||||||
|
|
||||||
cdef Lexeme* lookup(self, unicode string) except NULL:
|
cdef Lexeme* lookup(self, unicode string) except NULL:
|
||||||
assert len(string) != 0
|
assert len(string) != 0
|
||||||
cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
|
cdef Lexeme* word
|
||||||
if word == NULL:
|
cdef LexID lex_id
|
||||||
|
cdef StringHash h = hash(string)
|
||||||
|
if h in self.vocab:
|
||||||
|
lex_id = self.vocab[h]
|
||||||
|
word = <Lexeme*>lex_id
|
||||||
|
else:
|
||||||
word = self.new_lexeme(string)
|
word = self.new_lexeme(string)
|
||||||
return word
|
return word
|
||||||
|
|
||||||
cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
|
cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
|
||||||
cdef StringHash h = hash(string)
|
cdef StringHash h = hash(string)
|
||||||
cdef Lexeme** chunk = <Lexeme**>self.chunks[h]
|
cdef Lexeme** chunk
|
||||||
cdef int split
|
cdef size_t chunk_id
|
||||||
if chunk == NULL:
|
if h in self.chunks:
|
||||||
|
chunk_id = self.chunks[h]
|
||||||
|
chunk = <Lexeme**>chunk_id
|
||||||
|
else:
|
||||||
chunk = self.new_chunk(string, self.find_substrings(string))
|
chunk = self.new_chunk(string, self.find_substrings(string))
|
||||||
return chunk
|
return chunk
|
||||||
|
|
||||||
|
@ -83,7 +89,8 @@ cdef class Language:
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
chunk[i] = self.lookup(substring)
|
chunk[i] = self.lookup(substring)
|
||||||
chunk[i + 1] = NULL
|
chunk[i + 1] = NULL
|
||||||
self.chunks[hash(string)] = <size_t>chunk
|
cdef StringHash h = hash(string)
|
||||||
|
self.chunks[h] = <size_t>chunk
|
||||||
return chunk
|
return chunk
|
||||||
|
|
||||||
cdef Lexeme* new_lexeme(self, unicode string) except NULL:
|
cdef Lexeme* new_lexeme(self, unicode string) except NULL:
|
||||||
|
@ -115,7 +122,7 @@ cdef class Language:
|
||||||
cdef LexID lex_id
|
cdef LexID lex_id
|
||||||
cdef Lexeme* word
|
cdef Lexeme* word
|
||||||
|
|
||||||
for key, lex_id in self.vocab:
|
for key, lex_id in self.vocab.items():
|
||||||
word = <Lexeme*>lex_id
|
word = <Lexeme*>lex_id
|
||||||
free(word.string_views)
|
free(word.string_views)
|
||||||
word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
|
word.string_views = <StringHash*>calloc(nr_views, sizeof(StringHash))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user