mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 18:24:08 +03:00
5861308910
Handle tokenizer special cases more generally by using the Matcher internally to match special cases after the affix/token_match tokenization is complete.

Instead of only matching special cases while processing balanced or nearly balanced prefixes and suffixes, this recognizes special cases in a wider range of contexts:

* Allows arbitrary numbers of prefixes/affixes around special cases
* Allows special cases separated by infixes

Existing tests/settings that couldn't be preserved as before:

* The emoticon '")' is no longer a supported special case
* The emoticon ':)' in "example:)" is a false positive again

When merged with #4258 (or the relevant cache bugfix), the affix and token_match properties should be modified to flush and reload all special cases to use the updated internal tokenization with the Matcher.
39 lines
1.3 KiB
Cython
39 lines
1.3 KiB
Cython
from libcpp.vector cimport vector
|
|
|
|
from preshed.maps cimport PreshMap
|
|
from cymem.cymem cimport Pool
|
|
|
|
from .typedefs cimport hash_t
|
|
from .structs cimport LexemeC, TokenC
|
|
from .strings cimport StringStore
|
|
from .tokens.doc cimport Doc
|
|
from .vocab cimport Vocab, LexemesOrTokens, _Cached
|
|
|
|
|
|
# C-level declaration of the Tokenizer extension type. Only attribute types
# and method signatures are declared here; no method bodies appear in this
# file. Declaration order is kept exactly as in the original, since it
# defines the layout of the extension type.
cdef class Tokenizer:
    # --- Internal state (C-level only, not visible from Python) ---
    # cymem memory pool held by the tokenizer.
    cdef Pool mem
    # preshed hash maps. Presumably `_cache` is keyed by the `hash_t key`
    # values that `_try_cache` / `_save_cached` receive, and `_specials`
    # holds special-case entries -- TODO confirm against the implementation.
    cdef PreshMap _cache
    cdef PreshMap _specials
    # Exposed to Python as a read-only attribute.
    # NOTE(review): declared with `cpdef` rather than the usual `cdef` for
    # attributes; left unchanged here.
    cpdef readonly Vocab vocab

    # --- Python-visible, reassignable attributes (`cdef public`) ---
    # Typed as generic Python objects; judging by the names these are the
    # token-match / prefix / suffix / infix callables -- TODO confirm.
    cdef public object token_match
    cdef public object prefix_search
    cdef public object suffix_search
    cdef public object infix_finditer
    # Private Python objects (neither `public` nor `readonly`).
    # `_special_matcher` is presumably the Matcher used internally for
    # special cases -- TODO confirm against the implementation.
    cdef object _rules
    cdef object _special_matcher

    # `cpdef`: callable from both Python and C. Takes a Python list and
    # returns a Doc.
    cpdef Doc tokens_from_list(self, list strings)

    # --- C-level methods ---
    # Methods declared `except -1` use -1 as the error return value so that
    # Python exceptions raised inside them propagate to the caller.
    cdef Doc _tokenize_affixes(self, unicode string)
    # NOTE(review): unlike the other `int`-returning methods below, this one
    # has no `except -1` clause, so an exception raised inside it may not
    # propagate to the caller. Confirm whether that is intended; any change
    # must be made together with the implementation's signature.
    cdef int _apply_special_cases(self, Doc doc)
    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
    cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
    # `prefixes` / `suffixes` are pointers to caller-owned C++ vectors of
    # LexemeC pointers; presumably filled in by `_split_affixes` and consumed
    # by `_attach_tokens` -- TODO confirm.
    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
                                vector[LexemeC*] *suffixes)
    cdef int _attach_tokens(self, Doc tokens, unicode string,
                            vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1

    # Receives a pointer to `const TokenC` together with a count `n` and a
    # hash key (presumably `n` is the number of tokens at `tokens` -- TODO
    # confirm).
    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
                          int n) except -1