mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Add tokenizer option to allow Matcher handling for all rules
Add tokenizer option `with_faster_rules_heuristics` that determines
whether the special cases applied by the internal `Matcher` are filtered
by whether they contain affixes or space. If `True` (default), the rules
are filtered to prioritize speed over rare edge cases. If `False`, all
rules are included in the final `Matcher`-based pass over the doc.
* Reset all caches when reloading special cases
* Revert "Reset all caches when reloading special cases"
This reverts commit 4ef6bd171d.
* Initialize max_length properly
* Add new tag to API docs
* Rename to faster heuristics
		
	
			
		
			
				
	
	
		
			55 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			55 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
| from libcpp.vector cimport vector
 | |
| from preshed.maps cimport PreshMap
 | |
| from cymem.cymem cimport Pool
 | |
| 
 | |
| from .typedefs cimport hash_t
 | |
| from .structs cimport LexemeC, SpanC, TokenC
 | |
| from .strings cimport StringStore
 | |
| from .tokens.doc cimport Doc
 | |
| from .vocab cimport Vocab, LexemesOrTokens, _Cached
 | |
| from .matcher.phrasematcher cimport PhraseMatcher
 | |
| 
 | |
| 
 | |
| cdef class Tokenizer:  # C-level declarations only (attributes + method signatures); the bodies are not in this file
 | |
|     cdef Pool mem  # cymem Pool; presumably owns the tokenizer's C allocations -- TODO confirm in the implementation
 | |
|     cdef PreshMap _cache  # PreshMap (preshed); presumably hash-keyed cache of tokenizations -- TODO confirm
 | |
|     cdef PreshMap _specials  # PreshMap (preshed); presumably hash-keyed special-case entries -- TODO confirm
 | |
|     cdef readonly Vocab vocab  # `readonly`: readable but not assignable from Python code
 | |
| 
 | |
|     cdef object _token_match  # this and the following `object` attributes are untyped Python objects
 | |
|     cdef object _url_match
 | |
|     cdef object _prefix_search
 | |
|     cdef object _suffix_search
 | |
|     cdef object _infix_finditer
 | |
|     cdef object _rules
 | |
|     cdef PhraseMatcher _special_matcher  # typed as PhraseMatcher; presumably drives the special-case pass -- verify
 | |
|     # TODO convert to bool in v4
 | |
|     cdef int _faster_heuristics  # stored as int for now (see TODO above)
 | |
|     # TODO next one is unused and should be removed in v4
 | |
|     # https://github.com/explosion/spaCy/pull/9150
 | |
|     cdef int _unused_int2  # placeholder; per the TODO above it is not used
 | |
| 
 | |
|     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)  # returns a Doc (Python object)
 | |
|     cdef int _apply_special_cases(self, Doc doc) except -1  # `except -1`: a return of -1 signals a raised exception
 | |
|     cdef void _filter_special_spans(self, vector[SpanC] &original,  # reads `original`, presumably fills `filtered` -- verify
 | |
|                             vector[SpanC] &filtered, int doc_len) nogil  # `nogil`: may be called without holding the GIL
 | |
|     cdef object _prepare_special_spans(self, Doc doc,  # returns a Python object; its shape is not visible here
 | |
|                                        vector[SpanC] &filtered)
 | |
|     cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
 | |
|                                        object span_data)  # NOTE(review): no `except` clause, unlike the other int-returning methods -- confirm intentional
 | |
|     cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,  # `key` is a hash_t; `has_special` is an int pointer (presumably an out-flag -- verify)
 | |
|                                      int* has_special,
 | |
|                                      bint with_special_cases) except -1  # -1 signals a raised exception
 | |
|     cdef int _tokenize(self, Doc tokens, str span, hash_t key,
 | |
|                        int* has_special, bint with_special_cases) except -1  # -1 signals a raised exception
 | |
|     cdef str _split_affixes(self, Pool mem, str string,  # takes an explicit Pool argument in addition to self.mem
 | |
|                                 vector[LexemeC*] *prefixes,  # pointer to a caller-provided vector of LexemeC pointers
 | |
|                                 vector[LexemeC*] *suffixes, int* has_special,
 | |
|                                 bint with_special_cases)
 | |
|     cdef int _attach_tokens(self, Doc tokens, str string,
 | |
|                             vector[LexemeC*] *prefixes,
 | |
|                             vector[LexemeC*] *suffixes, int* has_special,
 | |
|                             bint with_special_cases) except -1  # -1 signals a raised exception
 | |
|     cdef int _save_cached(self, const TokenC* tokens, hash_t key,  # `tokens` is const: not modified through this pointer
 | |
|                           int* has_special, int n) except -1  # -1 signals a raised exception
 |