mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Use int8_t instead of char in Matcher (#6413)
* Use signed char instead of char in Matcher Remove unused char* utf8_t typedef * Use int8_t instead of signed char
This commit is contained in:
parent
4284605683
commit
3f61f5eb54
|
@ -3,7 +3,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t, int8_t
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
|
@ -279,7 +279,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
# avoid any processing or mem alloc if the document is empty
|
# avoid any processing or mem alloc if the document is empty
|
||||||
return output
|
return output
|
||||||
if len(predicates) > 0:
|
if len(predicates) > 0:
|
||||||
predicate_cache = <char*>mem.alloc(length * len(predicates), sizeof(char))
|
predicate_cache = <int8_t*>mem.alloc(length * len(predicates), sizeof(int8_t))
|
||||||
if extensions is not None and len(extensions) >= 1:
|
if extensions is not None and len(extensions) >= 1:
|
||||||
nr_extra_attr = max(extensions.values()) + 1
|
nr_extra_attr = max(extensions.values()) + 1
|
||||||
extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
|
extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
|
||||||
|
@ -320,7 +320,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
|
|
||||||
|
|
||||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||||
char* cached_py_predicates,
|
int8_t* cached_py_predicates,
|
||||||
Token token, const attr_t* extra_attrs, py_predicates) except *:
|
Token token, const attr_t* extra_attrs, py_predicates) except *:
|
||||||
cdef int q = 0
|
cdef int q = 0
|
||||||
cdef vector[PatternStateC] new_states
|
cdef vector[PatternStateC] new_states
|
||||||
|
@ -392,7 +392,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
states.push_back(new_states[i])
|
states.push_back(new_states[i])
|
||||||
|
|
||||||
|
|
||||||
cdef int update_predicate_cache(char* cache,
|
cdef int update_predicate_cache(int8_t* cache,
|
||||||
const TokenPatternC* pattern, Token token, predicates) except -1:
|
const TokenPatternC* pattern, Token token, predicates) except -1:
|
||||||
# If the state references any extra predicates, check whether they match.
|
# If the state references any extra predicates, check whether they match.
|
||||||
# These are cached, so that we don't call these potentially expensive
|
# These are cached, so that we don't call these potentially expensive
|
||||||
|
@ -430,7 +430,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
|
||||||
|
|
||||||
cdef action_t get_action(PatternStateC state,
|
cdef action_t get_action(PatternStateC state,
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
const TokenC* token, const attr_t* extra_attrs,
|
||||||
const char* predicate_matches) nogil:
|
const int8_t* predicate_matches) nogil:
|
||||||
"""We need to consider:
|
"""We need to consider:
|
||||||
a) Does the token match the specification? [Yes, No]
|
a) Does the token match the specification? [Yes, No]
|
||||||
b) What's the quantifier? [1, 0+, ?]
|
b) What's the quantifier? [1, 0+, ?]
|
||||||
|
@ -488,7 +488,7 @@ cdef action_t get_action(PatternStateC state,
|
||||||
|
|
||||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||||
"""
|
"""
|
||||||
cdef char is_match
|
cdef int8_t is_match
|
||||||
is_match = get_is_match(state, token, extra_attrs, predicate_matches)
|
is_match = get_is_match(state, token, extra_attrs, predicate_matches)
|
||||||
quantifier = get_quantifier(state)
|
quantifier = get_quantifier(state)
|
||||||
is_final = get_is_final(state)
|
is_final = get_is_final(state)
|
||||||
|
@ -540,9 +540,9 @@ cdef action_t get_action(PatternStateC state,
|
||||||
return RETRY
|
return RETRY
|
||||||
|
|
||||||
|
|
||||||
cdef char get_is_match(PatternStateC state,
|
cdef int8_t get_is_match(PatternStateC state,
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
const TokenC* token, const attr_t* extra_attrs,
|
||||||
const char* predicate_matches) nogil:
|
const int8_t* predicate_matches) nogil:
|
||||||
for i in range(state.pattern.nr_py):
|
for i in range(state.pattern.nr_py):
|
||||||
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
||||||
return 0
|
return 0
|
||||||
|
@ -557,7 +557,7 @@ cdef char get_is_match(PatternStateC state,
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
cdef char get_is_final(PatternStateC state) nogil:
|
cdef int8_t get_is_final(PatternStateC state) nogil:
|
||||||
if state.pattern[1].quantifier == FINAL_ID:
|
if state.pattern[1].quantifier == FINAL_ID:
|
||||||
id_attr = state.pattern[1].attrs[0]
|
id_attr = state.pattern[1].attrs[0]
|
||||||
if id_attr.attr != ID:
|
if id_attr.attr != ID:
|
||||||
|
@ -568,7 +568,7 @@ cdef char get_is_final(PatternStateC state) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef char get_quantifier(PatternStateC state) nogil:
|
cdef int8_t get_quantifier(PatternStateC state) nogil:
|
||||||
return state.pattern.quantifier
|
return state.pattern.quantifier
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
|
|
||||||
ctypedef uint64_t hash_t
|
ctypedef uint64_t hash_t
|
||||||
ctypedef char* utf8_t
|
|
||||||
ctypedef uint64_t attr_t
|
ctypedef uint64_t attr_t
|
||||||
ctypedef uint64_t flags_t
|
ctypedef uint64_t flags_t
|
||||||
ctypedef uint16_t len_t
|
ctypedef uint16_t len_t
|
||||||
|
|
|
@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .structs cimport LexemeC, TokenC
|
from .structs cimport LexemeC, TokenC
|
||||||
from .typedefs cimport utf8_t, attr_t, hash_t
|
from .typedefs cimport attr_t, hash_t
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from .morphology cimport Morphology
|
from .morphology cimport Morphology
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user