Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
Latest commit:

* chore: add cython-linter dev dependency
* fix: lexeme.pyx
* fix: morphology.pxd
* fix: tokenizer.pxd
* fix: vocab.pxd
* fix: morphology.pxd (line length)
* ci: add cython-lint
* ci: fix cython-lint call
* Fix kb/candidate.pyx.
* Fix kb/kb.pyx.
* Fix kb/kb_in_memory.pyx.
* Fix kb.
* Fix training/ partially.
* Fix training/. Ignore trailing whitespaces and too long lines.
* Fix ml/.
* Fix matcher/.
* Fix pipeline/.
* Fix tokens/.
* Fix build errors. Fix vocab.pyx.
* Fix cython-lint install and run.
* Fix lexeme.pyx, parts_of_speech.pxd, vectors.pyx. Temporarily disable cython-lint execution.
* Fix attrs.pyx, lexeme.pyx, symbols.pxd, isort issues.
* Make cython-lint install conditional. Fix tokenizer.pyx.
* Fix remaining files. Reenable cython-lint check.
* Readded parentheses.
* Fix test_build_dependencies().
* Add explanatory comment to cython-lint execution.

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
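For reference, cython-lint is installed from PyPI and pointed at the Cython
sources. A minimal sketch of the kind of invocation the commit describes; the
exact paths and ignore codes used in spaCy's CI are assumptions here:

    pip install cython-lint
    # E501 = line too long, W291 = trailing whitespace, matching the
    # commit's "Ignore trailing whitespaces and too long lines"
    cython-lint spacy --ignore E501,W291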
tokenizer.pxd · 89 lines · 2.3 KiB · Cython
from cymem.cymem cimport Pool
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap

from .matcher.phrasematcher cimport PhraseMatcher
from .strings cimport StringStore
from .structs cimport LexemeC, SpanC, TokenC
from .tokens.doc cimport Doc
from .typedefs cimport hash_t
from .vocab cimport LexemesOrTokens, Vocab, _Cached


cdef class Tokenizer:
    # Memory pool plus the two hash-keyed maps: _cache stores the
    # tokenization of substrings seen before, _specials the special-case
    # rules.
    cdef Pool mem
    cdef PreshMap _cache
    cdef PreshMap _specials
    cdef readonly Vocab vocab

    # Callables supplied at construction time: token_match/url_match match
    # whole tokens, prefix_search/suffix_search/infix_finditer drive the
    # affix splitting; _rules holds the special-case definitions and
    # _special_matcher applies them to tokenized output.
    cdef object _token_match
    cdef object _url_match
    cdef object _prefix_search
    cdef object _suffix_search
    cdef object _infix_finditer
    cdef object _rules
    cdef PhraseMatcher _special_matcher
    # TODO convert to bool in v4
    cdef int _faster_heuristics
    # TODO next one is unused and should be removed in v4
    # https://github.com/explosion/spaCy/pull/9150
    cdef int _unused_int2

    # First pass: split the text on affixes, then optionally retokenize the
    # result according to the special-case rules.
    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
    cdef int _apply_special_cases(self, Doc doc) except -1
    # Helpers for applying special cases: filter overlapping candidate
    # spans, prepare the surviving spans, and retokenize the doc around
    # them.
    cdef void _filter_special_spans(
        self,
        vector[SpanC] &original,
        vector[SpanC] &filtered,
        int doc_len,
    ) nogil
    cdef object _prepare_special_spans(
        self,
        Doc doc,
        vector[SpanC] &filtered,
    )
    cdef int _retokenize_special_spans(
        self,
        Doc doc,
        TokenC* tokens,
        object span_data,
    )
    # Look a substring's hash up in _specials and _cache before doing any
    # splitting work; _tokenize handles one whitespace-delimited substring
    # that missed both.
    cdef int _try_specials_and_cache(
        self,
        hash_t key,
        Doc tokens,
        int* has_special,
        bint with_special_cases,
    ) except -1
    cdef int _tokenize(
        self,
        Doc tokens,
        str span,
        hash_t key,
        int* has_special,
        bint with_special_cases,
    ) except -1
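    # How the pieces below fit together, per the tokenization algorithm in
    # spaCy's docs: for each whitespace-delimited substring, the tokenizer
    # first tries the cache and the special cases; otherwise it repeatedly
    # strips prefixes and suffixes (_split_affixes), re-checking specials
    # and token_match/url_match along the way, splits what remains on
    # infixes and attaches everything to the Doc (_attach_tokens), then
    # caches the result (_save_cached). A hedged Python-level sketch, not
    # part of this file:
    #
    #     for substring in text.split():
    #         if cached(substring) or special_case(substring):
    #             emit(lookup(substring))             # _try_specials_and_cache
    #             continue
    #         rest = strip_affixes(substring)         # _split_affixes
    #         emit(prefixes + split_on_infixes(rest) + suffixes)
    #         cache[hash(substring)] = result         # _save_cached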
    # Affix loop: peel prefixes and suffixes off the substring, collecting
    # the corresponding lexemes, then attach prefix tokens, infix splits
    # and suffix tokens to the Doc.
    cdef str _split_affixes(
        self,
        Pool mem,
        str string,
        vector[LexemeC*] *prefixes,
        vector[LexemeC*] *suffixes,
        int* has_special,
        bint with_special_cases,
    )
    cdef int _attach_tokens(
        self,
        Doc tokens,
        str string,
        vector[LexemeC*] *prefixes,
        vector[LexemeC*] *suffixes,
        int* has_special,
        bint with_special_cases,
    ) except -1
    # Store a substring's tokenization in _cache, keyed by its hash, so the
    # work is skipped the next time the substring is seen.
    cdef int _save_cached(
        self,
        const TokenC* tokens,
        hash_t key,
        int* has_special,
        int n,
    ) except -1
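
# Usage sketch at the Python level (not part of this file): the public
# Tokenizer constructor populates the private fields declared above. The
# regexes here are illustrative assumptions, not spaCy's defaults.
#
#     import re
#     import spacy
#     from spacy.tokenizer import Tokenizer
#
#     nlp = spacy.blank("en")
#     prefix_re = re.compile(r'^[\[\("]')    # -> _prefix_search
#     suffix_re = re.compile(r'[\]\)"]$')    # -> _suffix_search
#     infix_re = re.compile(r'[-~]')         # -> _infix_finditer
#     nlp.tokenizer = Tokenizer(
#         nlp.vocab,
#         rules={"don't": [{"ORTH": "do"}, {"ORTH": "n't"}]},  # -> _rules
#         prefix_search=prefix_re.search,
#         suffix_search=suffix_re.search,
#         infix_finditer=infix_re.finditer,
#     )
#     doc = nlp("don't [tokenize] this-way")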