mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
45c9a68828
Modify the internal pattern representation in `Matcher` patterns to identify the final ID state using a unique quantifier rather than a combination of other attributes. It was insufficient to identify the final ID node based on an uninitialized `quantifier` (coincidentally being the same as the `ZERO`) with `nr_attr` as 0. (In addition, it was potentially bug-prone that `nr_attr` was set to 0 even though attrs were allocated.) In the case of `{"OP": "!"}` (a valid, if pointless, pattern), `nr_attr` is 0 and the quantifier is ZERO, so the previous methods for incrementing to the ID node at the end of the pattern weren't able to distinguish the final ID node from the `{"OP": "!"}` pattern.
73 lines
1.3 KiB
Cython
73 lines
1.3 KiB
Cython
from libc.stdint cimport int32_t
|
|
from libcpp.vector cimport vector
|
|
from cymem.cymem cimport Pool
|
|
|
|
from ..vocab cimport Vocab
|
|
from ..typedefs cimport attr_t, hash_t
|
|
from ..structs cimport TokenC
|
|
from ..lexeme cimport attr_id_t
|
|
|
|
|
|
cdef enum action_t:
|
|
REJECT = 0000
|
|
MATCH = 1000
|
|
ADVANCE = 0100
|
|
RETRY = 0010
|
|
RETRY_EXTEND = 0011
|
|
RETRY_ADVANCE = 0110
|
|
MATCH_EXTEND = 1001
|
|
MATCH_REJECT = 2000
|
|
MATCH_DOUBLE = 3000
|
|
|
|
|
|
cdef enum quantifier_t:
|
|
ZERO
|
|
ZERO_ONE
|
|
ZERO_PLUS
|
|
ONE
|
|
ONE_PLUS
|
|
FINAL_ID
|
|
|
|
|
|
cdef struct AttrValueC:
|
|
attr_id_t attr
|
|
attr_t value
|
|
|
|
cdef struct IndexValueC:
|
|
int32_t index
|
|
attr_t value
|
|
|
|
cdef struct TokenPatternC:
|
|
AttrValueC* attrs
|
|
int32_t* py_predicates
|
|
IndexValueC* extra_attrs
|
|
int32_t nr_attr
|
|
int32_t nr_extra_attr
|
|
int32_t nr_py
|
|
quantifier_t quantifier
|
|
hash_t key
|
|
|
|
|
|
cdef struct PatternStateC:
|
|
TokenPatternC* pattern
|
|
int32_t start
|
|
int32_t length
|
|
|
|
|
|
cdef struct MatchC:
|
|
attr_t pattern_id
|
|
int32_t start
|
|
int32_t length
|
|
|
|
|
|
cdef class Matcher:
|
|
cdef Pool mem
|
|
cdef vector[TokenPatternC*] patterns
|
|
cdef readonly Vocab vocab
|
|
cdef public object validator
|
|
cdef public object _patterns
|
|
cdef public object _callbacks
|
|
cdef public object _extensions
|
|
cdef public object _extra_predicates
|
|
cdef public object _seen_attrs
|