mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Use fused type in Tokens.push_back, simplifying the use of the cache
This commit is contained in:
parent
516f0f1e14
commit
495e1c7366
|
@ -13,6 +13,17 @@ from .tagger cimport univ_tag_t
|
|||
from .utf8string cimport StringStore, UniStr
|
||||
|
||||
|
||||
cdef union LexemesOrTokens:
    # Payload of a Cached entry. Both members share the same storage, so only
    # the one that was last written is meaningful; Cached.is_lex records which.
    const Lexeme* const* lexemes  # used when the entry holds Lexeme pointers
    TokenC* tokens                # used when the entry holds TokenC structs
|
||||
|
||||
|
||||
cdef struct Cached:
    # One entry stored in the tokenizer's lookup tables.
    LexemesOrTokens data  # the stored sequence (lexeme pointers or tokens)
    bint is_lex           # True when data.lexemes is in use, False for data.tokens
    int length            # number of elements reachable through data
|
||||
|
||||
|
||||
cdef class Lexicon:
|
||||
cpdef public get_lex_props
|
||||
cdef Pool mem
|
||||
|
|
|
@ -137,21 +137,19 @@ cdef class Language:
|
|||
|
||||
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
    """Append the stored analysis for `key` to `tokens`, if there is one.

    `self._specials` is consulted first; `self._cache` is only checked when
    the key has no special-case entry.

    idx: Offset handed to the Tokens container for the first appended item.
    key: Hash used for the lookup in both tables.
    tokens: Container that receives the cached tokens or lexemes.

    Returns True when an entry was found and appended, False otherwise.
    """
    cdef int i
    cdef TokenC* token
    cdef Cached* cached

    cached = <Cached*>self._specials.get(key)
    if cached != NULL:
        # Special-case entries are expected to hold TokenC structs.
        assert not cached.is_lex
        for i in range(cached.length):
            token = &cached.data.tokens[i]
            # push_back returns the offset to use for the following token.
            idx = tokens.push_back(idx, token)
        return True

    cached = <Cached*>self._cache.get(key)
    if cached != NULL:
        assert cached.is_lex == True
        # Pass the running offset `idx`; the loop counter `i` is never
        # assigned on this path, so it must not be used as the start offset.
        tokens.extend(idx, cached.data.lexemes, cached.length)
        return True

    return False
|
||||
|
@ -244,11 +242,14 @@ cdef class Language:
|
|||
for i in range(n):
|
||||
if tokens[i].lex.id == 1:
|
||||
return 0
|
||||
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
||||
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||
cached.length = n
|
||||
cached.is_lex = True
|
||||
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
|
||||
for i in range(n):
|
||||
lexemes[i] = tokens[i].lex
|
||||
lexemes[i + 1] = NULL
|
||||
self._cache.set(key, lexemes)
|
||||
cached.data.lexemes = <const Lexeme* const*>lexemes
|
||||
self._cache.set(key, cached)
|
||||
|
||||
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||
cdef unicode string = chars[:length]
|
||||
|
@ -287,10 +288,12 @@ cdef class Language:
|
|||
if lemma:
|
||||
tokens[i].lemma = self.lexicon.strings[lemma]
|
||||
set_morph_from_dict(&tokens[i].morph, props)
|
||||
# Null-terminated array
|
||||
tokens[i+1].lex = NULL
|
||||
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||
cached.length = len(substrings)
|
||||
cached.is_lex = False
|
||||
cached.data.tokens = tokens
|
||||
slice_unicode(&string, chunk, 0, len(chunk))
|
||||
self._specials.set(string.key, tokens)
|
||||
self._specials.set(string.key, cached)
|
||||
|
||||
|
||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||
|
|
|
@ -30,6 +30,14 @@ cdef struct TokenC:
|
|||
int sense
|
||||
|
||||
|
||||
# Single-name aliases for the two pointer types, so that they can be listed
# as the members of the fused type below.
ctypedef const Lexeme* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr


# Fused type taken by Tokens.push_back: the argument is either a pointer to
# a Lexeme or a pointer to a TokenC.
ctypedef fused LexemeOrToken:
    const_Lexeme_ptr
    TokenC_ptr
|
||||
|
||||
|
||||
cdef class Tokens:
|
||||
cdef Pool mem
|
||||
cdef StringStore _string_store
|
||||
|
@ -40,7 +48,7 @@ cdef class Tokens:
|
|||
cdef int max_length
|
||||
|
||||
cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1
|
||||
cdef int push_back(self, int i, const Lexeme* lexeme) except -1
|
||||
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
||||
cpdef int set_tag(self, int i, int tag_type, int tag) except -1
|
||||
|
||||
cpdef np.ndarray[long, ndim=2] get_array(self, list features)
|
||||
|
|
|
@ -60,16 +60,16 @@ cdef class Tokens:
|
|||
def __len__(self):
|
||||
return self.length
|
||||
|
||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
    """Append one item to the end of the token array.

    idx: Offset to record on the new token.
    lex_or_tok: Either a TokenC pointer, whose struct is copied into the new
        slot, or a Lexeme pointer, which is stored as the new token's `lex`.

    Returns `idx` plus the `length` of the new token's lexeme.
    """
    if self.length == self.max_length:
        # No free slot left: ask _realloc for twice the current length.
        self._realloc(self.length * 2)
    cdef TokenC* t = &self.data[self.length]
    if LexemeOrToken is TokenC_ptr:
        # Copy the whole struct from the source token.
        t[0] = lex_or_tok[0]
    else:
        t.lex = lex_or_tok
    # Record the offset after the branch: the struct copy would otherwise keep
    # the source token's idx, and the Lexeme branch does not set it at all.
    t.idx = idx
    self.length += 1
    return idx + t.lex.length
|
||||
|
||||
cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1:
|
||||
cdef int i
|
||||
|
|
Loading…
Reference in New Issue
Block a user