mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Use int8_t instead of char in Matcher (#6413)
* Use signed char instead of char in Matcher; remove unused char* utf8_t typedef.
* Use int8_t instead of signed char.
This commit is contained in:
parent
4284605683
commit
3f61f5eb54
|
@ -3,7 +3,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from libc.stdint cimport int32_t
|
||||
from libc.stdint cimport int32_t, int8_t
|
||||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
|
@ -279,7 +279,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
# avoid any processing or mem alloc if the document is empty
|
||||
return output
|
||||
if len(predicates) > 0:
|
||||
predicate_cache = <char*>mem.alloc(length * len(predicates), sizeof(char))
|
||||
predicate_cache = <int8_t*>mem.alloc(length * len(predicates), sizeof(int8_t))
|
||||
if extensions is not None and len(extensions) >= 1:
|
||||
nr_extra_attr = max(extensions.values()) + 1
|
||||
extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
|
||||
|
@ -320,7 +320,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
|
||||
|
||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||
char* cached_py_predicates,
|
||||
int8_t* cached_py_predicates,
|
||||
Token token, const attr_t* extra_attrs, py_predicates) except *:
|
||||
cdef int q = 0
|
||||
cdef vector[PatternStateC] new_states
|
||||
|
@ -392,7 +392,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
states.push_back(new_states[i])
|
||||
|
||||
|
||||
cdef int update_predicate_cache(char* cache,
|
||||
cdef int update_predicate_cache(int8_t* cache,
|
||||
const TokenPatternC* pattern, Token token, predicates) except -1:
|
||||
# If the state references any extra predicates, check whether they match.
|
||||
# These are cached, so that we don't call these potentially expensive
|
||||
|
@ -430,7 +430,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
|
|||
|
||||
cdef action_t get_action(PatternStateC state,
|
||||
const TokenC* token, const attr_t* extra_attrs,
|
||||
const char* predicate_matches) nogil:
|
||||
const int8_t* predicate_matches) nogil:
|
||||
"""We need to consider:
|
||||
a) Does the token match the specification? [Yes, No]
|
||||
b) What's the quantifier? [1, 0+, ?]
|
||||
|
@ -488,7 +488,7 @@ cdef action_t get_action(PatternStateC state,
|
|||
|
||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||
"""
|
||||
cdef char is_match
|
||||
cdef int8_t is_match
|
||||
is_match = get_is_match(state, token, extra_attrs, predicate_matches)
|
||||
quantifier = get_quantifier(state)
|
||||
is_final = get_is_final(state)
|
||||
|
@ -540,9 +540,9 @@ cdef action_t get_action(PatternStateC state,
|
|||
return RETRY
|
||||
|
||||
|
||||
cdef char get_is_match(PatternStateC state,
|
||||
cdef int8_t get_is_match(PatternStateC state,
|
||||
const TokenC* token, const attr_t* extra_attrs,
|
||||
const char* predicate_matches) nogil:
|
||||
const int8_t* predicate_matches) nogil:
|
||||
for i in range(state.pattern.nr_py):
|
||||
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
||||
return 0
|
||||
|
@ -557,7 +557,7 @@ cdef char get_is_match(PatternStateC state,
|
|||
return True
|
||||
|
||||
|
||||
cdef char get_is_final(PatternStateC state) nogil:
|
||||
cdef int8_t get_is_final(PatternStateC state) nogil:
|
||||
if state.pattern[1].quantifier == FINAL_ID:
|
||||
id_attr = state.pattern[1].attrs[0]
|
||||
if id_attr.attr != ID:
|
||||
|
@ -568,7 +568,7 @@ cdef char get_is_final(PatternStateC state) nogil:
|
|||
return 0
|
||||
|
||||
|
||||
cdef char get_quantifier(PatternStateC state) nogil:
|
||||
cdef int8_t get_quantifier(PatternStateC state) nogil:
|
||||
return state.pattern.quantifier
|
||||
|
||||
|
||||
|
|
|
@ -3,7 +3,6 @@ from libc.stdint cimport uint8_t
|
|||
|
||||
|
||||
ctypedef uint64_t hash_t
|
||||
ctypedef char* utf8_t
|
||||
ctypedef uint64_t attr_t
|
||||
ctypedef uint64_t flags_t
|
||||
ctypedef uint16_t len_t
|
||||
|
|
|
@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
|
|||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .structs cimport LexemeC, TokenC
|
||||
from .typedefs cimport utf8_t, attr_t, hash_t
|
||||
from .typedefs cimport attr_t, hash_t
|
||||
from .strings cimport StringStore
|
||||
from .morphology cimport Morphology
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user