Disable tokenizer cache for special cases. Fixes #1250

This commit is contained in:
Matthew Honnibal 2017-10-24 16:07:44 +02:00
parent 63f0bde749
commit b0f6fd3f1d
2 changed files with 23 additions and 9 deletions

View File

@@ -27,8 +27,9 @@ cdef class Tokenizer:
cdef int _try_cache(self, hash_t key, Doc tokens) except -1 cdef int _try_cache(self, hash_t key, Doc tokens) except -1
cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) vector[LexemeC*] *suffixes, int* has_special)
cdef int _attach_tokens(self, Doc tokens, unicode string, cdef int _attach_tokens(self, Doc tokens, unicode string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special,
int n) except -1

View File

@@ -20,7 +20,8 @@ cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment """Segment text, and create Doc objects with the discovered segment
boundaries. boundaries.
""" """
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text. """Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types. vocab (Vocab): A storage container for lexical types.
@@ -48,6 +49,7 @@ cdef class Tokenizer:
self.infix_finditer = infix_finditer self.infix_finditer = infix_finditer
self.vocab = vocab self.vocab = vocab
self._rules = {} self._rules = {}
if rules is not None:
for chunk, substrings in sorted(rules.items()): for chunk, substrings in sorted(rules.items()):
self.add_special_case(chunk, substrings) self.add_special_case(chunk, substrings)
@@ -148,14 +150,18 @@ cdef class Tokenizer:
cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes cdef vector[LexemeC*] suffixes
cdef int orig_size cdef int orig_size
cdef int has_special
orig_size = tokens.length orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
&has_special)
self._attach_tokens(tokens, span, &prefixes, &suffixes) self._attach_tokens(tokens, span, &prefixes, &suffixes)
self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef unicode _split_affixes(self, Pool mem, unicode string, cdef unicode _split_affixes(self, Pool mem, unicode string,
vector[const LexemeC*] *prefixes, vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes): vector[const LexemeC*] *suffixes,
int* has_special):
cdef size_t i cdef size_t i
cdef unicode prefix cdef unicode prefix
cdef unicode suffix cdef unicode suffix
@@ -174,6 +180,7 @@ cdef class Tokenizer:
if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL: if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix)) prefixes.push_back(self.vocab.get(mem, prefix))
has_special[0] = 1
break break
if self.token_match and self.token_match(string): if self.token_match and self.token_match(string):
break break
@@ -185,6 +192,7 @@ cdef class Tokenizer:
if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL): if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
string = minus_suf string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix)) suffixes.push_back(self.vocab.get(mem, suffix))
has_special[0] = 1
break break
if pre_len and suf_len and (pre_len + suf_len) <= len(string): if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len] string = string[pre_len:-suf_len]
@@ -197,6 +205,7 @@ cdef class Tokenizer:
string = minus_suf string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix)) suffixes.push_back(self.vocab.get(mem, suffix))
if string and (self._specials.get(hash_string(string)) != NULL): if string and (self._specials.get(hash_string(string)) != NULL):
has_special[0] = 1
break break
return string return string
@@ -256,11 +265,15 @@ cdef class Tokenizer:
preinc(it) preinc(it)
tokens.push_back(lexeme, False) tokens.push_back(lexeme, False)
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1: cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int has_special, int n) except -1:
cdef int i cdef int i
for i in range(n): for i in range(n):
if tokens[i].lex.id == 0: if tokens[i].lex.id == 0:
return 0 return 0
# See https://github.com/explosion/spaCy/issues/1250
if has_special:
return 0
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = n cached.length = n
cached.is_lex = True cached.is_lex = True