Mirror of https://github.com/explosion/spaCy.git
Move whole token match inside _split_affixes.
parent 3ba7c167a8
commit fde53be3b4
@@ -207,12 +207,9 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        if self.token_match and self.token_match(span):
-            tokens.push_back(self.vocab.get(tokens.mem, span), False)
-        else:
-            span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
-            self._attach_tokens(tokens, span, &prefixes, &suffixes)
-            self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
+        self._attach_tokens(tokens, span, &prefixes, &suffixes)
+        self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
 
     cdef unicode _split_affixes(self, Pool mem, unicode string,
                                 vector[const LexemeC*] *prefixes,
@@ -224,6 +221,8 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
+            if self.token_match and self.token_match(string):
+                break
             last_size = len(string)
             pre_len = self.find_prefix(string)
             if pre_len != 0:
@@ -234,6 +233,8 @@ cdef class Tokenizer:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     break
+            if self.token_match and self.token_match(string):
+                break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]
@@ -271,7 +272,11 @@ cdef class Tokenizer:
                 tokens.push_back(prefixes[0][i], False)
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
-            if not cache_hit:
+            if cache_hit:
+                pass
+            elif self.token_match and self.token_match(string):
+                tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
+            else:
                 matches = self.find_infix(string)
                 if not matches:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
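For context, a rough pure-Python sketch of the control flow this commit moves into _split_affixes: the affix-stripping loop now checks token_match on every pass and stops as soon as the remaining string matches, instead of only testing the original, unsplit span in _tokenize. The helper name, the regexes, and the example text below are illustrative stand-ins, not spaCy's actual rules; the real method also handles special cases and attaches the tokens in a separate step.

import re

# Illustrative stand-ins for the tokenizer's affix rules and token_match
# (not spaCy's real defaults).
find_prefix = re.compile(r"^[([]").match
find_suffix = re.compile(r"[)\].,!?]$").search
token_match = re.compile(r"https?://[\w./-]+$").match   # whole-token pattern

def split_affixes(string):
    # Simplified mirror of the post-commit loop in Tokenizer._split_affixes:
    # strip one prefix/suffix per pass, but stop as soon as token_match
    # accepts whatever is left.
    prefixes, suffixes = [], []
    last_size = 0
    while string and len(string) != last_size:
        if token_match and token_match(string):
            break                      # remainder is a single whole token
        last_size = len(string)
        pre = find_prefix(string)
        if pre:
            prefixes.append(pre.group())
            string = string[pre.end():]
        if token_match and token_match(string):
            break
        suf = find_suffix(string)
        if suf:
            suffixes.insert(0, suf.group())
            string = string[:suf.start()]
    return prefixes, string, suffixes

print(split_affixes("(https://example.com),"))
# -> (['('], 'https://example.com', [')', ','])

The point of the move: a whole-token pattern such as a URL can now win even when the raw span arrives wrapped in punctuation, because the match is retried after each round of affix stripping and again in _attach_tokens, rather than only once against the untouched span.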