Use int8_t instead of char in Matcher (#6413)

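* Use signed char instead of char in Matcher

  The signedness of plain char is implementation-defined in C (it is
  unsigned on some platforms), but the predicate cache stores -1 as a
  sentinel and compares against it, so an explicitly signed 8-bit type is
  required for that comparison to be portable.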

Remove unused char* utf8_t typedef

* Use int8_t instead of signed char
This commit is contained in:
Adriane Boyd 2020-11-23 10:26:47 +01:00 committed by GitHub
parent 4284605683
commit 3f61f5eb54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 11 additions and 12 deletions

View File

@ -3,7 +3,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.stdint cimport int32_t from libc.stdint cimport int32_t, int8_t
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
@ -279,7 +279,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
# avoid any processing or mem alloc if the document is empty # avoid any processing or mem alloc if the document is empty
return output return output
if len(predicates) > 0: if len(predicates) > 0:
predicate_cache = <char*>mem.alloc(length * len(predicates), sizeof(char)) predicate_cache = <int8_t*>mem.alloc(length * len(predicates), sizeof(int8_t))
if extensions is not None and len(extensions) >= 1: if extensions is not None and len(extensions) >= 1:
nr_extra_attr = max(extensions.values()) + 1 nr_extra_attr = max(extensions.values()) + 1
extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t)) extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
@ -320,7 +320,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
char* cached_py_predicates, int8_t* cached_py_predicates,
Token token, const attr_t* extra_attrs, py_predicates) except *: Token token, const attr_t* extra_attrs, py_predicates) except *:
cdef int q = 0 cdef int q = 0
cdef vector[PatternStateC] new_states cdef vector[PatternStateC] new_states
@ -392,7 +392,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
states.push_back(new_states[i]) states.push_back(new_states[i])
cdef int update_predicate_cache(char* cache, cdef int update_predicate_cache(int8_t* cache,
const TokenPatternC* pattern, Token token, predicates) except -1: const TokenPatternC* pattern, Token token, predicates) except -1:
# If the state references any extra predicates, check whether they match. # If the state references any extra predicates, check whether they match.
# These are cached, so that we don't call these potentially expensive # These are cached, so that we don't call these potentially expensive
@ -430,7 +430,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
cdef action_t get_action(PatternStateC state, cdef action_t get_action(PatternStateC state,
const TokenC* token, const attr_t* extra_attrs, const TokenC* token, const attr_t* extra_attrs,
const char* predicate_matches) nogil: const int8_t* predicate_matches) nogil:
"""We need to consider: """We need to consider:
a) Does the token match the specification? [Yes, No] a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?] b) What's the quantifier? [1, 0+, ?]
@ -488,7 +488,7 @@ cdef action_t get_action(PatternStateC state,
Problem: If a quantifier is matching, we're adding a lot of open partials Problem: If a quantifier is matching, we're adding a lot of open partials
""" """
cdef char is_match cdef int8_t is_match
is_match = get_is_match(state, token, extra_attrs, predicate_matches) is_match = get_is_match(state, token, extra_attrs, predicate_matches)
quantifier = get_quantifier(state) quantifier = get_quantifier(state)
is_final = get_is_final(state) is_final = get_is_final(state)
@ -540,9 +540,9 @@ cdef action_t get_action(PatternStateC state,
return RETRY return RETRY
cdef char get_is_match(PatternStateC state, cdef int8_t get_is_match(PatternStateC state,
const TokenC* token, const attr_t* extra_attrs, const TokenC* token, const attr_t* extra_attrs,
const char* predicate_matches) nogil: const int8_t* predicate_matches) nogil:
for i in range(state.pattern.nr_py): for i in range(state.pattern.nr_py):
if predicate_matches[state.pattern.py_predicates[i]] == -1: if predicate_matches[state.pattern.py_predicates[i]] == -1:
return 0 return 0
@ -557,7 +557,7 @@ cdef char get_is_match(PatternStateC state,
return True return True
cdef char get_is_final(PatternStateC state) nogil: cdef int8_t get_is_final(PatternStateC state) nogil:
if state.pattern[1].quantifier == FINAL_ID: if state.pattern[1].quantifier == FINAL_ID:
id_attr = state.pattern[1].attrs[0] id_attr = state.pattern[1].attrs[0]
if id_attr.attr != ID: if id_attr.attr != ID:
@ -568,7 +568,7 @@ cdef char get_is_final(PatternStateC state) nogil:
return 0 return 0
cdef char get_quantifier(PatternStateC state) nogil: cdef int8_t get_quantifier(PatternStateC state) nogil:
return state.pattern.quantifier return state.pattern.quantifier

View File

@ -3,7 +3,6 @@ from libc.stdint cimport uint8_t
ctypedef uint64_t hash_t ctypedef uint64_t hash_t
ctypedef char* utf8_t
ctypedef uint64_t attr_t ctypedef uint64_t attr_t
ctypedef uint64_t flags_t ctypedef uint64_t flags_t
ctypedef uint16_t len_t ctypedef uint16_t len_t

View File

@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, attr_t, hash_t from .typedefs cimport attr_t, hash_t
from .strings cimport StringStore from .strings cimport StringStore
from .morphology cimport Morphology from .morphology cimport Morphology