mirror of https://github.com/explosion/spaCy.git

	Make cython-lint install conditional. Fix tokenizer.pyx.
parent c32414c43b
commit 94110a1c6d

requirements.txt
@@ -38,5 +38,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
-cython-lint>=0.15.0
+cython-lint>=0.15.0; python_version >= "3.7"
 isort>=5.0,<6.0
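
The requirements.txt change is what makes the cython-lint install conditional: the `; python_version >= "3.7"` suffix is a PEP 508 environment marker, so pip skips the requirement entirely on older interpreters rather than trying (and failing) to install it. A minimal sketch of how such a marker evaluates, assuming the third-party `packaging` library is available (pip applies the same marker semantics internally):

    # Not part of this commit: evaluate the environment marker added above.
    from packaging.markers import Marker

    marker = Marker('python_version >= "3.7"')

    # True on Python 3.7+, False otherwise; when False, a requirement such as
    # `cython-lint>=0.15.0; python_version >= "3.7"` is simply ignored.
    print(marker.evaluate())
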
spacy/tokenizer.pyx
@@ -8,20 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 
 import re
-import warnings
-
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
 from . import util
 from .attrs import intify_attrs
-from .errors import Errors, Warnings
+from .errors import Errors
 from .scorer import Scorer
 from .symbols import NORM, ORTH
 from .tokens import Span
 from .training import validate_examples
-from .util import get_words_and_spaces, registry
+from .util import get_words_and_spaces
 
 
 cdef class Tokenizer:
@@ -324,7 +322,7 @@ cdef class Tokenizer:
         cdef int span_start
         cdef int span_end
         while i < doc.length:
-            if not i in span_data:
+            if i not in span_data:
                 tokens[i + offset] = doc.c[i]
                 i += 1
             else:
@@ -395,12 +393,15 @@ cdef class Tokenizer:
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef str _split_affixes(self, Pool mem, str string,
+    cdef str _split_affixes(
+        self,
+        Pool mem,
+        str string,
         vector[const LexemeC*] *prefixes,
         vector[const LexemeC*] *suffixes,
         int* has_special,
-                                bint with_special_cases):
-        cdef size_t i
+        bint with_special_cases
+    ):
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -445,10 +446,6 @@ cdef class Tokenizer:
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
                             bint with_special_cases) except -1:
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
-        cdef int split, end
-        cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
         cdef str span
         cdef int i
@@ -458,9 +455,11 @@ cdef class Tokenizer:
         if string:
             if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
-            elif (self.token_match and self.token_match(string)) or \
-                    (self.url_match and \
-                    self.url_match(string)):
+            elif (
+                self.token_match and self.token_match(string) or
+                self.url_match and self.url_match(string)
+            ):
+
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
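
The elif rewrite above drops the backslash continuations and inner parentheses without changing behaviour: in Python and Cython, `and` binds more tightly than `or`, so `A and B or C and D` already groups as `(A and B) or (C and D)`. A small sketch with hypothetical stand-in values (not spaCy code) showing the two spellings agree:

    # Hypothetical stand-ins for self.token_match and self.url_match.
    token_match = None
    url_match = lambda s: s.startswith("https://")
    string = "https://example.com"

    old_style = (token_match and token_match(string)) or \
        (url_match and url_match(string))
    new_style = (
        token_match and token_match(string) or
        url_match and url_match(string)
    )
    # `and` binds tighter than `or`, so the grouping is identical.
    assert old_style == new_style
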
@@ -821,7 +820,7 @@ cdef class Tokenizer:
         self.infix_finditer = None
         self.token_match = None
         self.url_match = None
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        util.from_bytes(bytes_data, deserializers, exclude)
         if "prefix_search" in data and isinstance(data["prefix_search"], str):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if "suffix_search" in data and isinstance(data["suffix_search"], str):
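
The final hunk removes an assignment whose result was never read: `util.from_bytes` walks a dict of named deserializer callbacks and calls each one for its side effects on the tokenizer, so binding the decoded message to `msg` served no purpose. A rough sketch of that setter-dict pattern, not spaCy's actual implementation (the real helper, `spacy.util.from_bytes`, uses msgpack rather than JSON):

    import json

    def from_bytes(bytes_data, setters, exclude):
        # Decode the payload and hand each section to its setter; the setters'
        # side effects are the point, so callers can ignore the return value.
        msg = json.loads(bytes_data)
        for name, setter in setters.items():
            if name not in exclude and name in msg:
                setter(msg[name])
        return msg

    state = {}
    deserializers = {
        "prefix_search": lambda value: state.update(prefix_search=value),
        "token_match": lambda value: state.update(token_match=value),
    }
    from_bytes(b'{"prefix_search": "demo", "token_match": null}', deserializers, exclude=[])
    print(state)  # {'prefix_search': 'demo', 'token_match': None}
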