Make cython-lint install conditional. Fix tokenizer.pyx.

Raphael Mitsch 2023-07-04 09:24:28 +02:00
parent c32414c43b
commit 94110a1c6d
2 changed files with 19 additions and 20 deletions

requirements.txt

@@ -38,5 +38,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
-cython-lint>=0.15.0
+cython-lint>=0.15.0; python_version >= "3.7"
 isort>=5.0,<6.0

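The `; python_version >= "3.7"` suffix is a standard PEP 508 environment marker, so pip only installs cython-lint on interpreters that satisfy it and skips the requirement elsewhere. A minimal sketch of how such a marker is evaluated, using the packaging library (an illustration only, not part of this commit):

    from packaging.markers import Marker

    # The same marker string that appears in requirements.txt.
    marker = Marker('python_version >= "3.7"')

    # True on Python 3.7+, False otherwise; pip performs an equivalent
    # check before deciding whether to install cython-lint.
    print(marker.evaluate())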
spacy/tokenizer.pyx

@@ -8,20 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 import re
-import warnings
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 from . import util
 from .attrs import intify_attrs
-from .errors import Errors, Warnings
+from .errors import Errors
 from .scorer import Scorer
 from .symbols import NORM, ORTH
 from .tokens import Span
 from .training import validate_examples
-from .util import get_words_and_spaces, registry
+from .util import get_words_and_spaces
 cdef class Tokenizer:
@@ -324,7 +322,7 @@ cdef class Tokenizer:
         cdef int span_start
         cdef int span_end
         while i < doc.length:
-            if not i in span_data:
+            if i not in span_data:
                 tokens[i + offset] = doc.c[i]
                 i += 1
             else:
@@ -395,12 +393,15 @@ cdef class Tokenizer:
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef str _split_affixes(self, Pool mem, str string,
-                            vector[const LexemeC*] *prefixes,
-                            vector[const LexemeC*] *suffixes,
-                            int* has_special,
-                            bint with_special_cases):
-        cdef size_t i
+    cdef str _split_affixes(
+        self,
+        Pool mem,
+        str string,
+        vector[const LexemeC*] *prefixes,
+        vector[const LexemeC*] *suffixes,
+        int* has_special,
+        bint with_special_cases
+    ):
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -445,10 +446,6 @@ cdef class Tokenizer:
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
                             bint with_special_cases) except -1:
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
-        cdef int split, end
-        cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
         cdef str span
         cdef int i
@@ -458,9 +455,11 @@ cdef class Tokenizer:
         if string:
             if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
-            elif (self.token_match and self.token_match(string)) or \
-                    (self.url_match and \
-                    self.url_match(string)):
+            elif (
+                self.token_match and self.token_match(string) or
+                self.url_match and self.url_match(string)
+            ):
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
@@ -821,7 +820,7 @@ cdef class Tokenizer:
         self.infix_finditer = None
         self.token_match = None
         self.url_match = None
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        util.from_bytes(bytes_data, deserializers, exclude)
         if "prefix_search" in data and isinstance(data["prefix_search"], str):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if "suffix_search" in data and isinstance(data["suffix_search"], str):