Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-29 18:54:07 +03:00)
Use special Matcher only for cases with affixes
* Reinsert specials cache checks during normal tokenization for special cases as much as possible
* Additionally include specials cache checks while splitting on infixes
* Since the special Matcher needs consistent affix-only tokenization for the special cases themselves, introduce the argument `with_special_cases` in order to do tokenization with or without specials cache checks
* After normal tokenization, postprocess with special cases Matcher for special cases containing affixes
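For context, the user-visible effect can be illustrated with the public tokenizer API. The snippet below is a minimal sketch: the blank English pipeline and the "dont" rule are illustrative assumptions, not taken from the commit.

from spacy.lang.en import English
from spacy.symbols import ORTH

nlp = English()  # blank pipeline: only the tokenizer runs

# Illustrative special case: "dont" should be split into two tokens.
nlp.tokenizer.add_special_case("dont", [{ORTH: "do"}, {ORTH: "nt"}])

print([t.text for t in nlp("dont")])   # ['do', 'nt']      -- exact match via the specials cache
print([t.text for t in nlp("dont!")])  # ['do', 'nt', '!'] -- special case next to a suffix,
                                       # the situation the affix checks and the special-cases
                                       # Matcher postprocess are meant to handle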
commit 33946d2ef8
parent b097b0b83d
spacy/tokenizer.pxd
@@ -26,14 +26,20 @@ cdef class Tokenizer:
     cpdef Doc tokens_from_list(self, list strings)
 
-    cdef Doc _tokenize_affixes(self, unicode string)
+    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc)
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
-    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
-                                vector[LexemeC*] *suffixes)
+    cdef int _try_specials(self, hash_t key, Doc tokens,
+                           int* has_special) except -1
+    cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
+                       int* has_special, bint with_special_cases) except -1
+    cdef unicode _split_affixes(self, Pool mem, unicode string,
+                                vector[LexemeC*] *prefixes,
+                                vector[LexemeC*] *suffixes, int* has_special,
+                                bint with_special_cases)
     cdef int _attach_tokens(self, Doc tokens, unicode string,
-                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes, int* has_special,
+                            bint with_special_cases) except -1
     cdef int _save_cached(self, const TokenC* tokens, hash_t key,
-                          int n) except -1
+                          int* has_special, int n) except -1
spacy/tokenizer.pyx
@@ -121,12 +121,12 @@ cdef class Tokenizer:
         DOCS: https://spacy.io/api/tokenizer#call
         """
-        doc = self._tokenize_affixes(string)
+        doc = self._tokenize_affixes(string, True)
         self._apply_special_cases(doc)
         return doc
 
     @cython.boundscheck(False)
-    cdef Doc _tokenize_affixes(self, unicode string):
+    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
         """Tokenize according to affix and token_match settings.
 
         string (unicode): The string to tokenize.
@@ -140,7 +140,9 @@ cdef class Tokenizer:
             return doc
         cdef int i = 0
         cdef int start = 0
-        cdef bint cache_hit
+        cdef int has_special = 0
+        cdef bint specials_hit = 0
+        cdef bint cache_hit = 0
         cdef bint in_ws = string[0].isspace()
         cdef unicode span
         # The task here is much like string.split, but not quite
@@ -156,9 +158,14 @@ cdef class Tokenizer:
                     # we don't have to create the slice when we hit the cache.
                     span = string[start:i]
                     key = hash_string(span)
-                    cache_hit = self._try_cache(key, doc)
-                    if not cache_hit:
-                        self._tokenize(doc, span, key)
+                    specials_hit = 0
+                    cache_hit = 0
+                    if with_special_cases:
+                        specials_hit = self._try_specials(key, doc, &has_special)
+                    if not specials_hit:
+                        cache_hit = self._try_cache(key, doc)
+                    if not specials_hit and not cache_hit:
+                        self._tokenize(doc, span, key, &has_special, with_special_cases)
                 if uc == ' ':
                     doc.c[doc.length - 1].spacy = True
                     start = i + 1
@@ -169,9 +176,14 @@ cdef class Tokenizer:
         if start < i:
             span = string[start:]
             key = hash_string(span)
-            cache_hit = self._try_cache(key, doc)
-            if not cache_hit:
-                self._tokenize(doc, span, key)
+            specials_hit = 0
+            cache_hit = 0
+            if with_special_cases:
+                specials_hit = self._try_specials(key, doc, &has_special)
+            if not specials_hit:
+                cache_hit = self._try_cache(key, doc)
+            if not specials_hit and not cache_hit:
+                self._tokenize(doc, span, key, &has_special, with_special_cases)
             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc
@@ -270,19 +282,37 @@ cdef class Tokenizer:
                 tokens.push_back(&cached.data.tokens[i], False)
             return True
 
-    cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1:
+    cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
+        cached = <_Cached*>self._specials.get(key)
+        if cached == NULL:
+            return False
+        cdef int i
+        if cached.is_lex:
+            for i in range(cached.length):
+                tokens.push_back(cached.data.lexemes[i], False)
+        else:
+            for i in range(cached.length):
+                tokens.push_back(&cached.data.tokens[i], False)
+        has_special[0] = 1
+        return True
+
+    cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
-        self._attach_tokens(tokens, span, &prefixes, &suffixes)
-        self._save_cached(&tokens.c[orig_size], orig_key,
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+                                   has_special, with_special_cases)
+        self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
+                            with_special_cases)
+        self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
     cdef unicode _split_affixes(self, Pool mem, unicode string,
                                 vector[const LexemeC*] *prefixes,
-                                vector[const LexemeC*] *suffixes):
+                                vector[const LexemeC*] *suffixes,
+                                int* has_special,
+                                bint with_special_cases):
         cdef size_t i
         cdef unicode prefix
         cdef unicode suffix
@@ -292,15 +322,25 @@ cdef class Tokenizer:
         while string and len(string) != last_size:
             if self.token_match and self.token_match(string):
                 break
+            if with_special_cases and self._specials.get(hash_string(string)) != NULL:
+                break
             last_size = len(string)
             pre_len = self.find_prefix(string)
             if pre_len != 0:
                 prefix = string[:pre_len]
                 minus_pre = string[pre_len:]
+                if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
+                    string = minus_pre
+                    prefixes.push_back(self.vocab.get(mem, prefix))
+                    break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]
                 minus_suf = string[:-suf_len]
+                if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
+                    string = minus_suf
+                    suffixes.push_back(self.vocab.get(mem, suffix))
+                    break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
                 prefixes.push_back(self.vocab.get(mem, prefix))
@@ -315,8 +355,11 @@ cdef class Tokenizer:
 
     cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[const LexemeC*] *prefixes,
-                            vector[const LexemeC*] *suffixes) except -1:
-        cdef bint cache_hit
+                            vector[const LexemeC*] *suffixes,
+                            int* has_special,
+                            bint with_special_cases) except -1:
+        cdef bint specials_hit = 0
+        cdef bint cache_hit = 0
         cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
@@ -326,8 +369,12 @@ cdef class Tokenizer:
         for i in range(prefixes.size()):
            tokens.push_back(prefixes[0][i], False)
         if string:
-            cache_hit = self._try_cache(hash_string(string), tokens)
-            if cache_hit:
+            if with_special_cases:
+                specials_hit = self._try_specials(hash_string(string), tokens,
+                                                  has_special)
+            if not specials_hit:
+                cache_hit = self._try_cache(hash_string(string), tokens)
+            if specials_hit or cache_hit:
                 pass
             elif self.token_match and self.token_match(string):
                 # We're always saying 'no' to spaces here -- the caller will
@@ -372,11 +419,14 @@ cdef class Tokenizer:
             tokens.push_back(lexeme, False)
 
     cdef int _save_cached(self, const TokenC* tokens, hash_t key,
-                          int n) except -1:
+                          int* has_special, int n) except -1:
         cdef int i
         for i in range(n):
             if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
                 return 0
+        # See #1250
+        if has_special[0]:
+            return 0
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = n
         cached.is_lex = True
@@ -470,7 +520,8 @@ cdef class Tokenizer:
             self.mem.free(stale_special)
         self._rules[string] = substrings
         self._flush_cache()
-        self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string)])
+        if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string):
+            self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string, False)])
 
     def _reload_special_cases(self):
         try:
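Taken together, the new control flow for each whitespace-delimited chunk is: specials table first (when with_special_cases is set), then the regular cache, then full affix splitting. Below is a schematic, pure-Python sketch of that lookup order; the dict-based tables, the toy affix splitter, and the function scaffolding are illustrative assumptions, not the real Cython internals, and the has_special bookkeeping that suppresses caching around special cases is omitted for brevity.

# Schematic sketch of the per-chunk lookup order implemented above.
def affix_tokenize(span):
    # Toy stand-in for _split_affixes/_attach_tokens: peel one trailing "!".
    if len(span) > 1 and span.endswith("!"):
        return [span[:-1], "!"]
    return [span]

def tokenize_chunk(span, specials, cache, with_special_cases=True):
    out = []
    specials_hit = cache_hit = False
    if with_special_cases and span in specials:
        out.extend(specials[span])        # _try_specials: emit the rule's tokens
        specials_hit = True
    if not specials_hit and span in cache:
        out.extend(cache[span])           # _try_cache: reuse a previous result
        cache_hit = True
    if not specials_hit and not cache_hit:
        out.extend(affix_tokenize(span))  # _tokenize: full prefix/suffix/infix split
        cache[span] = list(out)           # _save_cached: store in the general cache
    return out

specials = {"dont": ["do", "nt"]}
cache = {}
print(tokenize_chunk("dont", specials, cache))    # ['do', 'nt']
print(tokenize_chunk("hello!", specials, cache))  # ['hello', '!']

After the whole text is processed this way, `_apply_special_cases` runs the special-cases Matcher over the Doc; per the last hunk, a rule is only registered with that Matcher when its string itself contains a prefix, infix or suffix, so the Matcher pass is reserved for the cases the character-level loop cannot catch on its own.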