mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
Simplify specials and cache checks (#6012)
This commit is contained in:
parent
122cb02001
commit
77ac4a38aa
|
@ -34,9 +34,9 @@ cdef class Tokenizer:
|
||||||
vector[SpanC] &filtered)
|
vector[SpanC] &filtered)
|
||||||
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
|
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
|
||||||
object span_data)
|
object span_data)
|
||||||
cdef int _try_cache(self, hash_t key, Doc tokens) except -1
|
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
|
||||||
cdef int _try_specials(self, hash_t key, Doc tokens,
|
int* has_special,
|
||||||
int* has_special) except -1
|
bint with_special_cases) except -1
|
||||||
cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
|
cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
|
||||||
int* has_special, bint with_special_cases) except -1
|
int* has_special, bint with_special_cases) except -1
|
||||||
cdef unicode _split_affixes(self, Pool mem, unicode string,
|
cdef unicode _split_affixes(self, Pool mem, unicode string,
|
||||||
|
|
|
@ -169,8 +169,6 @@ cdef class Tokenizer:
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef int start = 0
|
cdef int start = 0
|
||||||
cdef int has_special = 0
|
cdef int has_special = 0
|
||||||
cdef bint specials_hit = 0
|
|
||||||
cdef bint cache_hit = 0
|
|
||||||
cdef bint in_ws = string[0].isspace()
|
cdef bint in_ws = string[0].isspace()
|
||||||
cdef unicode span
|
cdef unicode span
|
||||||
# The task here is much like string.split, but not quite
|
# The task here is much like string.split, but not quite
|
||||||
|
@ -186,13 +184,7 @@ cdef class Tokenizer:
|
||||||
# we don't have to create the slice when we hit the cache.
|
# we don't have to create the slice when we hit the cache.
|
||||||
span = string[start:i]
|
span = string[start:i]
|
||||||
key = hash_string(span)
|
key = hash_string(span)
|
||||||
specials_hit = 0
|
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
|
||||||
cache_hit = 0
|
|
||||||
if with_special_cases:
|
|
||||||
specials_hit = self._try_specials(key, doc, &has_special)
|
|
||||||
if not specials_hit:
|
|
||||||
cache_hit = self._try_cache(key, doc)
|
|
||||||
if not specials_hit and not cache_hit:
|
|
||||||
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
||||||
if uc == ' ':
|
if uc == ' ':
|
||||||
doc.c[doc.length - 1].spacy = True
|
doc.c[doc.length - 1].spacy = True
|
||||||
|
@ -204,13 +196,7 @@ cdef class Tokenizer:
|
||||||
if start < i:
|
if start < i:
|
||||||
span = string[start:]
|
span = string[start:]
|
||||||
key = hash_string(span)
|
key = hash_string(span)
|
||||||
specials_hit = 0
|
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
|
||||||
cache_hit = 0
|
|
||||||
if with_special_cases:
|
|
||||||
specials_hit = self._try_specials(key, doc, &has_special)
|
|
||||||
if not specials_hit:
|
|
||||||
cache_hit = self._try_cache(key, doc)
|
|
||||||
if not specials_hit and not cache_hit:
|
|
||||||
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
self._tokenize(doc, span, key, &has_special, with_special_cases)
|
||||||
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
|
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
|
||||||
return doc
|
return doc
|
||||||
|
@ -364,27 +350,33 @@ cdef class Tokenizer:
|
||||||
offset += span[3]
|
offset += span[3]
|
||||||
return offset
|
return offset
|
||||||
|
|
||||||
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
|
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
|
||||||
cached = <_Cached*>self._cache.get(key)
|
cdef bint specials_hit = 0
|
||||||
if cached == NULL:
|
cdef bint cache_hit = 0
|
||||||
return False
|
|
||||||
cdef int i
|
cdef int i
|
||||||
if cached.is_lex:
|
if with_special_cases:
|
||||||
for i in range(cached.length):
|
cached = <_Cached*>self._specials.get(key)
|
||||||
tokens.push_back(cached.data.lexemes[i], False)
|
if cached == NULL:
|
||||||
else:
|
specials_hit = False
|
||||||
for i in range(cached.length):
|
else:
|
||||||
tokens.push_back(&cached.data.tokens[i], False)
|
for i in range(cached.length):
|
||||||
return True
|
tokens.push_back(&cached.data.tokens[i], False)
|
||||||
|
has_special[0] = 1
|
||||||
cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
|
specials_hit = True
|
||||||
cached = <_Cached*>self._specials.get(key)
|
if not specials_hit:
|
||||||
if cached == NULL:
|
cached = <_Cached*>self._cache.get(key)
|
||||||
|
if cached == NULL:
|
||||||
|
cache_hit = False
|
||||||
|
else:
|
||||||
|
if cached.is_lex:
|
||||||
|
for i in range(cached.length):
|
||||||
|
tokens.push_back(cached.data.lexemes[i], False)
|
||||||
|
else:
|
||||||
|
for i in range(cached.length):
|
||||||
|
tokens.push_back(&cached.data.tokens[i], False)
|
||||||
|
cache_hit = True
|
||||||
|
if not specials_hit and not cache_hit:
|
||||||
return False
|
return False
|
||||||
cdef int i
|
|
||||||
for i in range(cached.length):
|
|
||||||
tokens.push_back(&cached.data.tokens[i], False)
|
|
||||||
has_special[0] = 1
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
|
cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
|
||||||
|
@ -462,12 +454,7 @@ cdef class Tokenizer:
|
||||||
for i in range(prefixes.size()):
|
for i in range(prefixes.size()):
|
||||||
tokens.push_back(prefixes[0][i], False)
|
tokens.push_back(prefixes[0][i], False)
|
||||||
if string:
|
if string:
|
||||||
if with_special_cases:
|
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
|
||||||
specials_hit = self._try_specials(hash_string(string), tokens,
|
|
||||||
has_special)
|
|
||||||
if not specials_hit:
|
|
||||||
cache_hit = self._try_cache(hash_string(string), tokens)
|
|
||||||
if specials_hit or cache_hit:
|
|
||||||
pass
|
pass
|
||||||
elif (self.token_match and self.token_match(string)) or \
|
elif (self.token_match and self.token_match(string)) or \
|
||||||
(self.url_match and \
|
(self.url_match and \
|
||||||
|
|
Loading…
Reference in New Issue
Block a user