Simplify specials and cache checks (#6012)

Adriane Boyd 2020-09-03 09:42:49 +02:00 committed by GitHub
parent 122cb02001
commit 77ac4a38aa
2 changed files with 31 additions and 44 deletions

spacy/tokenizer.pxd

@@ -34,9 +34,9 @@ cdef class Tokenizer:
                                        vector[SpanC] &filtered)
     cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
                                        object span_data)
-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
-    cdef int _try_specials(self, hash_t key, Doc tokens,
-                           int* has_special) except -1
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string,

spacy/tokenizer.pyx

@@ -169,8 +169,6 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef int has_special = 0
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
         cdef bint in_ws = string[0].isspace()
         cdef unicode span
         # The task here is much like string.split, but not quite
@@ -186,13 +184,7 @@ cdef class Tokenizer:
                     # we don't have to create the slice when we hit the cache.
                     span = string[start:i]
                     key = hash_string(span)
-                    specials_hit = 0
-                    cache_hit = 0
-                    if with_special_cases:
-                        specials_hit = self._try_specials(key, doc, &has_special)
-                    if not specials_hit:
-                        cache_hit = self._try_cache(key, doc)
-                    if not specials_hit and not cache_hit:
+                    if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                         self._tokenize(doc, span, key, &has_special, with_special_cases)
                 if uc == ' ':
                     doc.c[doc.length - 1].spacy = True
@@ -204,13 +196,7 @@ cdef class Tokenizer:
         if start < i:
             span = string[start:]
             key = hash_string(span)
-            specials_hit = 0
-            cache_hit = 0
-            if with_special_cases:
-                specials_hit = self._try_specials(key, doc, &has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(key, doc)
-            if not specials_hit and not cache_hit:
+            if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                 self._tokenize(doc, span, key, &has_special, with_special_cases)
             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc
@@ -364,27 +350,33 @@ cdef class Tokenizer:
             offset += span[3]
         return offset
 
-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
-        cached = <_Cached*>self._cache.get(key)
-        if cached == NULL:
-            return False
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
+        cdef bint specials_hit = 0
+        cdef bint cache_hit = 0
         cdef int i
-        if cached.is_lex:
-            for i in range(cached.length):
-                tokens.push_back(cached.data.lexemes[i], False)
-        else:
-            for i in range(cached.length):
-                tokens.push_back(&cached.data.tokens[i], False)
-        return True
-
-    cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
-        cached = <_Cached*>self._specials.get(key)
-        if cached == NULL:
+        if with_special_cases:
+            cached = <_Cached*>self._specials.get(key)
+            if cached == NULL:
+                specials_hit = False
+            else:
+                for i in range(cached.length):
+                    tokens.push_back(&cached.data.tokens[i], False)
+                has_special[0] = 1
+                specials_hit = True
+        if not specials_hit:
+            cached = <_Cached*>self._cache.get(key)
+            if cached == NULL:
+                cache_hit = False
+            else:
+                if cached.is_lex:
+                    for i in range(cached.length):
+                        tokens.push_back(cached.data.lexemes[i], False)
+                else:
+                    for i in range(cached.length):
+                        tokens.push_back(&cached.data.tokens[i], False)
+                cache_hit = True
+        if not specials_hit and not cache_hit:
             return False
-        cdef int i
-        for i in range(cached.length):
-            tokens.push_back(&cached.data.tokens[i], False)
-        has_special[0] = 1
         return True
 
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
@@ -462,12 +454,7 @@ cdef class Tokenizer:
             for i in range(prefixes.size()):
                 tokens.push_back(prefixes[0][i], False)
         if string:
-            if with_special_cases:
-                specials_hit = self._try_specials(hash_string(string), tokens,
-                                                  has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(hash_string(string), tokens)
-            if specials_hit or cache_hit:
+            if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
             elif (self.token_match and self.token_match(string)) or \
                     (self.url_match and \
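
The new _try_specials_and_cache folds the former _try_specials and _try_cache lookups into one call: check the special-cases table first (when special cases are enabled), fall back to the cache, and report whether either hit. A rough Python sketch of that control flow follows; the plain dicts, the list standing in for Doc, and names like SimpleTokenizer are illustrative stand-ins for the Cython PreshMap tables and Doc.push_back, not spaCy's API.

    class SimpleTokenizer:
        # Illustrative stand-in: dicts play the role of the PreshMap tables
        # (self._specials and self._cache); a Python list stands in for Doc.
        def __init__(self):
            self._specials = {}   # special-case tokenizations, keyed by string hash
            self._cache = {}      # previously computed tokenizations, keyed by string hash

        def _try_specials_and_cache(self, key, tokens, has_special, with_special_cases):
            # Return True if the key was served from the specials table or the cache.
            specials_hit = False
            cache_hit = False
            if with_special_cases:
                cached = self._specials.get(key)
                if cached is not None:
                    tokens.extend(cached)      # push the special-case tokens
                    has_special[0] = 1         # mirror the int* out-parameter
                    specials_hit = True
            if not specials_hit:
                cached = self._cache.get(key)
                if cached is not None:
                    tokens.extend(cached)      # push the cached tokens
                    cache_hit = True
            return specials_hit or cache_hit


    tok = SimpleTokenizer()
    tok._specials[hash("don't")] = ["do", "n't"]
    doc, has_special = [], [0]
    if not tok._try_specials_and_cache(hash("don't"), doc, has_special, True):
        pass  # a miss would fall through to full tokenization, as in the hunks above

Call sites then replace the old two-flag bookkeeping (specials_hit / cache_hit) with a single boolean test, which is the simplification the three call-site hunks above apply.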