Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-10 16:22:29 +03:00

Commit 94110a1c6d (parent c32414c43b)
Make cython-lint install conditional. Fix tokenizer.pyx.
requirements.txt
@@ -38,5 +38,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
-cython-lint>=0.15.0
+cython-lint>=0.15.0; python_version >= "3.7"
 isort>=5.0,<6.0
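The fix here is the '; python_version >= "3.7"' suffix: a PEP 508 environment marker. pip evaluates the marker at install time and skips the requirement when it is false, so Python 3.6 environments no longer attempt to install cython-lint at all. A small sketch of how such a marker is evaluated, using the packaging library (pip vendors the same logic):

# Evaluate the PEP 508 marker added above against an environment.
from packaging.markers import Marker

marker = Marker('python_version >= "3.7"')

print(marker.evaluate())                           # True on a 3.7+ interpreter
print(marker.evaluate({"python_version": "3.6"}))  # False -> pip skips the pin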
spacy/tokenizer.pyx
@@ -8,20 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 
 import re
-import warnings
 
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
 from . import util
 from .attrs import intify_attrs
-from .errors import Errors, Warnings
+from .errors import Errors
 from .scorer import Scorer
 from .symbols import NORM, ORTH
 from .tokens import Span
 from .training import validate_examples
-from .util import get_words_and_spaces, registry
+from .util import get_words_and_spaces
 
 
 cdef class Tokenizer:
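All three removals are unused names: import warnings, the Warnings import, and registry are no longer referenced anywhere in tokenizer.pyx. This is exactly the class of problem cython-lint (added to the dev requirements above) reports for .pyx files, much as pyflakes does for plain Python, and the same check motivates the deleted cdef locals in the hunks further down. A hypothetical, much-simplified sketch of such an unused-name check (the real tool works on the Cython source, not Python's ast module):

# Collect names that are imported or assigned, then flag any that are
# never loaded again. Simplified illustration only.
import ast

SOURCE = """
import re
import warnings
from .errors import Errors, Warnings

def handle(text):
    specials_hit = 0          # assigned but never read
    return Errors, re.compile(text)
"""

tree = ast.parse(SOURCE)
bound = set()
for node in ast.walk(tree):
    if isinstance(node, (ast.Import, ast.ImportFrom)):
        bound |= {a.asname or a.name.split(".")[0] for a in node.names}
    elif isinstance(node, ast.Assign):
        bound |= {t.id for t in node.targets if isinstance(t, ast.Name)}
loaded = {
    n.id for n in ast.walk(tree)
    if isinstance(n, ast.Name) and isinstance(n.ctx, ast.Load)
}
print(sorted(bound - loaded))  # ['Warnings', 'specials_hit', 'warnings']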
@@ -324,7 +322,7 @@ cdef class Tokenizer:
         cdef int span_start
         cdef int span_end
         while i < doc.length:
-            if not i in span_data:
+            if i not in span_data:
                 tokens[i + offset] = doc.c[i]
                 i += 1
             else:
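Both spellings perform the same membership test; "i not in span_data" is the form PEP 8 recommends, and pycodestyle-style checkers report "not i in ..." as E713. A quick way to check on your own interpreter that the edit is purely cosmetic is to compare the compiled bytecode:

# In recent CPython versions the compiler folds "not x in y" into the
# same instructions as "x not in y"; dis makes that easy to inspect.
import dis

dis.dis(compile("not i in span_data", "<old>", "eval"))
print("---")
dis.dis(compile("i not in span_data", "<new>", "eval"))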
@@ -395,12 +393,15 @@ cdef class Tokenizer:
             self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                               tokens.length - orig_size)
 
-    cdef str _split_affixes(self, Pool mem, str string,
-                            vector[const LexemeC*] *prefixes,
-                            vector[const LexemeC*] *suffixes,
-                            int* has_special,
-                            bint with_special_cases):
-        cdef size_t i
+    cdef str _split_affixes(
+        self,
+        Pool mem,
+        str string,
+        vector[const LexemeC*] *prefixes,
+        vector[const LexemeC*] *suffixes,
+        int* has_special,
+        bint with_special_cases
+    ):
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -445,10 +446,6 @@ cdef class Tokenizer:
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
                             bint with_special_cases) except -1:
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
-        cdef int split, end
-        cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
         cdef str span
         cdef int i
@@ -458,9 +455,11 @@ cdef class Tokenizer:
         if string:
             if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
-            elif (self.token_match and self.token_match(string)) or \
-                    (self.url_match and \
-                    self.url_match(string)):
+            elif (
+                self.token_match and self.token_match(string) or
+                self.url_match and self.url_match(string)
+            ):
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
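Besides replacing backslash continuations with the implicit continuation inside parentheses that PEP 8 prefers, the rewrite drops the explicit grouping parentheses. That is safe because "and" binds more tightly than "or". A small self-contained check, with hypothetical stand-ins for the matcher attributes:

import re

def check(token_match, url_match, string):
    # old form: backslash continuations with explicit grouping
    old = (token_match and token_match(string)) or \
          (url_match and url_match(string))
    # new form: implicit continuation inside parentheses; "and" binds
    # tighter than "or", so the explicit grouping was redundant
    new = (
        token_match and token_match(string) or
        url_match and url_match(string)
    )
    assert bool(old) == bool(new)
    return bool(new)

print(check(re.compile(r"\w+").match, None, "hello"))  # True
print(check(None, None, "hello"))                      # False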
@@ -821,7 +820,7 @@ cdef class Tokenizer:
         self.infix_finditer = None
         self.token_match = None
         self.url_match = None
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        util.from_bytes(bytes_data, deserializers, exclude)
         if "prefix_search" in data and isinstance(data["prefix_search"], str):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if "suffix_search" in data and isinstance(data["suffix_search"], str):
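util.from_bytes does its work through the deserializers callbacks, each of which mutates the tokenizer in place; the returned message dict was bound to msg but never read afterwards, which is the kind of dead assignment cython-lint flags. A minimal, illustrative sketch of that callback pattern (not spaCy's actual implementation; the real from_bytes also decodes msgpack data first, and all names here are stand-ins):

class MiniTokenizer:
    """Illustrative stand-in; not spaCy's Tokenizer."""

    def __init__(self):
        self.prefix_search = None
        self.url_match = None

    def from_bytes(self, msg, exclude=()):
        # Each deserializer mutates self as a side effect, so callers
        # only need the call itself, not the returned dict.
        deserializers = {
            "prefix_search": lambda v: setattr(self, "prefix_search", v),
            "url_match": lambda v: setattr(self, "url_match", v),
        }
        for key, setter in deserializers.items():
            if key in msg and key not in exclude:
                setter(msg[key])
        return msg

tok = MiniTokenizer()
tok.from_bytes({"prefix_search": r"^\W+"})  # no need for "msg = ..."
print(tok.prefix_search)                    # ^\W+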