mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
b0228d8ea6
* chore: add cython-linter dev dependency * fix: lexeme.pyx * fix: morphology.pxd * fix: tokenizer.pxd * fix: vocab.pxd * fix: morphology.pxd (line length) * ci: add cython-lint * ci: fix cython-lint call * Fix kb/candidate.pyx. * Fix kb/kb.pyx. * Fix kb/kb_in_memory.pyx. * Fix kb. * Fix training/ partially. * Fix training/. Ignore trailing whitespaces and too long lines. * Fix ml/. * Fix matcher/. * Fix pipeline/. * Fix tokens/. * Fix build errors. Fix vocab.pyx. * Fix cython-lint install and run. * Fix lexeme.pyx, parts_of_speech.pxd, vectors.pyx. Temporarily disable cython-lint execution. * Fix attrs.pyx, lexeme.pyx, symbols.pxd, isort issues. * Make cython-lint install conditional. Fix tokenizer.pyx. * Fix remaining files. Reenable cython-lint check. * Readded parentheses. * Fix test_build_dependencies(). * Add explanatory comment to cython-lint execution. --------- Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
89 lines
2.3 KiB
Cython
89 lines
2.3 KiB
Cython
from cymem.cymem cimport Pool
|
|
from libcpp.vector cimport vector
|
|
from preshed.maps cimport PreshMap
|
|
|
|
from .matcher.phrasematcher cimport PhraseMatcher
|
|
from .strings cimport StringStore
|
|
from .structs cimport LexemeC, SpanC, TokenC
|
|
from .tokens.doc cimport Doc
|
|
from .typedefs cimport hash_t
|
|
from .vocab cimport LexemesOrTokens, Vocab, _Cached
|
|
|
|
|
|
cdef class Tokenizer:
    # C-level declaration of the Tokenizer extension type. This block only
    # declares attributes and method signatures; the method bodies are
    # expected in the matching .pyx implementation file.
    #
    # NOTE: the order of attributes and cdef methods determines the layout of
    # the generated C struct / vtable, so keep it stable when editing.

    # Typed C-level attributes. `vocab` is declared `readonly`, so it is
    # readable (but not assignable) from Python; the others are only
    # accessible from Cython code.
    cdef Pool mem
    cdef PreshMap _cache
    cdef PreshMap _specials
    cdef readonly Vocab vocab

    # Untyped Python-object slots.
    # NOTE(review): judging by the names these hold the token/URL match
    # callables, the prefix/suffix/infix matchers and the special-case rules;
    # confirm against the .pyx before relying on that.
    cdef object _token_match
    cdef object _url_match
    cdef object _prefix_search
    cdef object _suffix_search
    cdef object _infix_finditer
    cdef object _rules
    cdef PhraseMatcher _special_matcher
    # TODO convert to bool in v4
    cdef int _faster_heuristics
    # TODO next one is unused and should be removed in v4
    # https://github.com/explosion/spaCy/pull/9150
    cdef int _unused_int2

    # Method declarations. Those marked `except -1` use -1 as the C return
    # value that signals a raised Python exception to the caller.
    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)

    cdef int _apply_special_cases(self, Doc doc) except -1

    # Declared `nogil`: callable without holding the GIL. Both vectors are
    # passed by reference.
    cdef void _filter_special_spans(
        self,
        vector[SpanC] &original,
        vector[SpanC] &filtered,
        int doc_len,
    ) nogil

    cdef object _prepare_special_spans(
        self,
        Doc doc,
        vector[SpanC] &filtered,
    )

    # NOTE(review): unlike the other int-returning methods here, this one
    # declares no explicit `except` value -- confirm that this is intended
    # and that it matches the definition in the .pyx.
    cdef int _retokenize_special_spans(
        self,
        Doc doc,
        TokenC* tokens,
        object span_data,
    )

    cdef int _try_specials_and_cache(
        self,
        hash_t key,
        Doc tokens,
        int* has_special,
        bint with_special_cases,
    ) except -1

    cdef int _tokenize(
        self,
        Doc tokens,
        str span,
        hash_t key,
        int* has_special,
        bint with_special_cases,
    ) except -1

    # `prefixes`, `suffixes` and `has_special` are raw pointers.
    # NOTE(review): presumably they are out-parameters filled in by the
    # callee -- verify against the implementation.
    cdef str _split_affixes(
        self,
        Pool mem,
        str string,
        vector[LexemeC*] *prefixes,
        vector[LexemeC*] *suffixes,
        int* has_special,
        bint with_special_cases,
    )

    cdef int _attach_tokens(
        self,
        Doc tokens,
        str string,
        vector[LexemeC*] *prefixes,
        vector[LexemeC*] *suffixes,
        int* has_special,
        bint with_special_cases,
    ) except -1

    cdef int _save_cached(
        self,
        const TokenC* tokens,
        hash_t key,
        int* has_special,
        int n,
    ) except -1