mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Use int8_t instead of char in Matcher (#6413)
* Use signed char instead of char in Matcher

  Remove unused char* utf8_t typedef

* Use int8_t instead of signed char
This commit is contained in:
		
							parent
							
								
									4284605683
								
							
						
					
					
						commit
						3f61f5eb54
					
				| 
						 | 
				
			
			@ -3,7 +3,7 @@
 | 
			
		|||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
from libcpp.vector cimport vector
 | 
			
		||||
from libc.stdint cimport int32_t
 | 
			
		||||
from libc.stdint cimport int32_t, int8_t
 | 
			
		||||
from cymem.cymem cimport Pool
 | 
			
		||||
from murmurhash.mrmr cimport hash64
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -279,7 +279,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 | 
			
		|||
        # avoid any processing or mem alloc if the document is empty
 | 
			
		||||
        return output
 | 
			
		||||
    if len(predicates) > 0:
 | 
			
		||||
        predicate_cache = <char*>mem.alloc(length * len(predicates), sizeof(char))
 | 
			
		||||
        predicate_cache = <int8_t*>mem.alloc(length * len(predicates), sizeof(int8_t))
 | 
			
		||||
    if extensions is not None and len(extensions) >= 1:
 | 
			
		||||
        nr_extra_attr = max(extensions.values()) + 1
 | 
			
		||||
        extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
 | 
			
		||||
| 
						 | 
				
			
			@ -320,7 +320,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
 | 
			
		||||
                            char* cached_py_predicates,
 | 
			
		||||
                            int8_t* cached_py_predicates,
 | 
			
		||||
        Token token, const attr_t* extra_attrs, py_predicates) except *:
 | 
			
		||||
    cdef int q = 0
 | 
			
		||||
    cdef vector[PatternStateC] new_states
 | 
			
		||||
| 
						 | 
				
			
			@ -392,7 +392,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
 | 
			
		|||
        states.push_back(new_states[i])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef int update_predicate_cache(char* cache,
 | 
			
		||||
cdef int update_predicate_cache(int8_t* cache,
 | 
			
		||||
        const TokenPatternC* pattern, Token token, predicates) except -1:
 | 
			
		||||
    # If the state references any extra predicates, check whether they match.
 | 
			
		||||
    # These are cached, so that we don't call these potentially expensive
 | 
			
		||||
| 
						 | 
				
			
			@ -430,7 +430,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
 | 
			
		|||
 | 
			
		||||
cdef action_t get_action(PatternStateC state,
 | 
			
		||||
        const TokenC* token, const attr_t* extra_attrs,
 | 
			
		||||
        const char* predicate_matches) nogil:
 | 
			
		||||
        const int8_t* predicate_matches) nogil:
 | 
			
		||||
    """We need to consider:
 | 
			
		||||
    a) Does the token match the specification? [Yes, No]
 | 
			
		||||
    b) What's the quantifier? [1, 0+, ?]
 | 
			
		||||
| 
						 | 
				
			
			@ -488,7 +488,7 @@ cdef action_t get_action(PatternStateC state,
 | 
			
		|||
 | 
			
		||||
    Problem: If a quantifier is matching, we're adding a lot of open partials
 | 
			
		||||
    """
 | 
			
		||||
    cdef char is_match
 | 
			
		||||
    cdef int8_t is_match
 | 
			
		||||
    is_match = get_is_match(state, token, extra_attrs, predicate_matches)
 | 
			
		||||
    quantifier = get_quantifier(state)
 | 
			
		||||
    is_final = get_is_final(state)
 | 
			
		||||
| 
						 | 
				
			
			@ -540,9 +540,9 @@ cdef action_t get_action(PatternStateC state,
 | 
			
		|||
          return RETRY
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef char get_is_match(PatternStateC state,
 | 
			
		||||
cdef int8_t get_is_match(PatternStateC state,
 | 
			
		||||
        const TokenC* token, const attr_t* extra_attrs,
 | 
			
		||||
        const char* predicate_matches) nogil:
 | 
			
		||||
        const int8_t* predicate_matches) nogil:
 | 
			
		||||
    for i in range(state.pattern.nr_py):
 | 
			
		||||
        if predicate_matches[state.pattern.py_predicates[i]] == -1:
 | 
			
		||||
            return 0
 | 
			
		||||
| 
						 | 
				
			
			@ -557,7 +557,7 @@ cdef char get_is_match(PatternStateC state,
 | 
			
		|||
    return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef char get_is_final(PatternStateC state) nogil:
 | 
			
		||||
cdef int8_t get_is_final(PatternStateC state) nogil:
 | 
			
		||||
    if state.pattern[1].quantifier == FINAL_ID:
 | 
			
		||||
        id_attr = state.pattern[1].attrs[0]
 | 
			
		||||
        if id_attr.attr != ID:
 | 
			
		||||
| 
						 | 
				
			
			@ -568,7 +568,7 @@ cdef char get_is_final(PatternStateC state) nogil:
 | 
			
		|||
        return 0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef char get_quantifier(PatternStateC state) nogil:
 | 
			
		||||
cdef int8_t get_quantifier(PatternStateC state) nogil:
 | 
			
		||||
    return state.pattern.quantifier
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,7 +3,6 @@ from libc.stdint cimport uint8_t
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
ctypedef uint64_t hash_t
 | 
			
		||||
ctypedef char* utf8_t
 | 
			
		||||
ctypedef uint64_t attr_t
 | 
			
		||||
ctypedef uint64_t flags_t
 | 
			
		||||
ctypedef uint16_t len_t
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
 | 
			
		|||
from murmurhash.mrmr cimport hash64
 | 
			
		||||
 | 
			
		||||
from .structs cimport LexemeC, TokenC
 | 
			
		||||
from .typedefs cimport utf8_t, attr_t, hash_t
 | 
			
		||||
from .typedefs cimport attr_t, hash_t
 | 
			
		||||
from .strings cimport StringStore
 | 
			
		||||
from .morphology cimport Morphology
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user