Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-13 10:46:29 +03:00
Disable tokenizer cache for special-cases. Fixes #1250
This commit is contained in:
parent 63f0bde749
commit b0f6fd3f1d
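In short, the change threads a new has_special flag from _split_affixes (where stripping a prefix or suffix can expose a substring that matches a special-case rule) through _tokenize and into _save_cached, which now refuses to write such results into the tokenizer's string cache. The sketch below is a minimal pure-Python illustration of that caching policy only; it is not spaCy's API, and every name in it is invented for the example.

def tokenize(string, cache, specials):
    # Toy whitespace tokenizer: expand special cases, and cache a result
    # only when no special case was involved (the has_special idea).
    if string in cache:
        return cache[string]
    hit_special = False
    tokens = []
    for piece in string.split():
        if piece in specials:
            hit_special = True        # analogue of has_special[0] = 1
            tokens.extend(specials[piece])
        else:
            tokens.append(piece)
    if not hit_special:               # analogue of the early return in _save_cached
        cache[string] = tokens
    return tokens

cache, specials = {}, {"gimme": ["gim", "me"]}
print(tokenize("gimme that", cache, specials))   # ['gim', 'me', 'that']
print(cache)                                     # {} -- the special-case result is not cached

Keeping special-case results out of the cache means they are always recomputed against the current rules rather than served from a stale entry, which appears to be the behaviour #1250 asked for.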
@@ -27,8 +27,9 @@ cdef class Tokenizer:
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
-                                vector[LexemeC*] *suffixes)
+                                vector[LexemeC*] *suffixes, int* has_special)
     cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
 
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special,
+                          int n) except -1
@@ -20,7 +20,8 @@ cdef class Tokenizer:
     """Segment text, and create Doc objects with the discovered segment
     boundaries.
     """
-    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
+    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
+                 suffix_search=None, infix_finditer=None, token_match=None):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -48,6 +49,7 @@ cdef class Tokenizer:
         self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
-        for chunk, substrings in sorted(rules.items()):
-            self.add_special_case(chunk, substrings)
+        if rules is not None:
+            for chunk, substrings in sorted(rules.items()):
+                self.add_special_case(chunk, substrings)
 
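A side effect of the new signature is that rules (and the affix callables) are now optional, so a Tokenizer can be constructed without any special cases. As a usage note, and assuming a current spaCy installation (spacy.blank postdates this commit), the documented way to get a bare whitespace-only tokenizer is:

import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")            # blank English pipeline (assumed available)
tokenizer = Tokenizer(nlp.vocab)   # no rules, prefixes, suffixes or infixes
print([t.text for t in tokenizer("What's happened to me?")])
# ["What's", 'happened', 'to', 'me?'] -- splits on whitespace only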
@@ -148,14 +150,18 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
+        cdef int has_special
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+                                   &has_special)
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
-        self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
+        self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                          tokens.length - orig_size)
 
     cdef unicode _split_affixes(self, Pool mem, unicode string,
                                 vector[const LexemeC*] *prefixes,
-                                vector[const LexemeC*] *suffixes):
+                                vector[const LexemeC*] *suffixes,
+                                int* has_special):
         cdef size_t i
         cdef unicode prefix
         cdef unicode suffix
@@ -174,6 +180,7 @@ cdef class Tokenizer:
                 if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
+                    has_special[0] = 1
                     break
             if self.token_match and self.token_match(string):
                 break
@@ -185,6 +192,7 @@ cdef class Tokenizer:
                 if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
                     string = minus_suf
                     suffixes.push_back(self.vocab.get(mem, suffix))
+                    has_special[0] = 1
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
@@ -197,6 +205,7 @@ cdef class Tokenizer:
                 string = minus_suf
                 suffixes.push_back(self.vocab.get(mem, suffix))
             if string and (self._specials.get(hash_string(string)) != NULL):
+                has_special[0] = 1
                 break
         return string
 
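The three hunks above set the flag wherever the remaining string matches an entry in self._specials, i.e. a rule registered through add_special_case. For reference, and using the current spaCy API with an assumed blank pipeline, a special case is a fixed substring with a predefined split:

import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp.tokenizer("gimme that")])   # ['gim', 'me', 'that']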
@@ -256,11 +265,15 @@ cdef class Tokenizer:
                 preinc(it)
                 tokens.push_back(lexeme, False)
 
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+                          int has_special, int n) except -1:
         cdef int i
         for i in range(n):
             if tokens[i].lex.id == 0:
                 return 0
+        # See https://github.com/explosion/spaCy/issues/1250
+        if has_special:
+            return 0
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = n
         cached.is_lex = True
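The early return above is the substance of the fix: once any affix-splitting branch has reported a special case, the assembled tokens are still attached to the Doc (via _attach_tokens), but the result is never stored in the tokenizer's cache under the original string's key, so later lookups fall through to a fresh analysis instead of a potentially stale cached one. Skipping the cache here is presumably cheaper and simpler than trying to invalidate affected entries whenever the special-case rules change.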