mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 13:47:13 +03:00
40c995b1be
* add "greedy" option for match pattern * distinction between greedy FIRST or LONGEST * check for proper values, throw custom warning otherwise * unxfail one more test * add comment in docstring * add test that LONGEST also prefers first match if equal length * use c arrays for more efficient processing * rename 'greediness' to 'greedy'
73 lines
1.4 KiB
Cython
73 lines
1.4 KiB
Cython
from libc.stdint cimport int32_t
|
|
from libcpp.vector cimport vector
|
|
from cymem.cymem cimport Pool
|
|
|
|
from ..vocab cimport Vocab
|
|
from ..typedefs cimport attr_t, hash_t
|
|
from ..structs cimport TokenC
|
|
from ..lexeme cimport attr_id_t
|
|
|
|
|
|
cdef enum action_t:
|
|
REJECT = 0000
|
|
MATCH = 1000
|
|
ADVANCE = 0100
|
|
RETRY = 0010
|
|
RETRY_EXTEND = 0011
|
|
RETRY_ADVANCE = 0110
|
|
MATCH_EXTEND = 1001
|
|
MATCH_REJECT = 2000
|
|
MATCH_DOUBLE = 3000
|
|
|
|
|
|
cdef enum quantifier_t:
|
|
ZERO
|
|
ZERO_ONE
|
|
ZERO_PLUS
|
|
ONE
|
|
ONE_PLUS
|
|
|
|
|
|
cdef struct AttrValueC:
|
|
attr_id_t attr
|
|
attr_t value
|
|
|
|
cdef struct IndexValueC:
|
|
int32_t index
|
|
attr_t value
|
|
|
|
cdef struct TokenPatternC:
|
|
AttrValueC* attrs
|
|
int32_t* py_predicates
|
|
IndexValueC* extra_attrs
|
|
int32_t nr_attr
|
|
int32_t nr_extra_attr
|
|
int32_t nr_py
|
|
quantifier_t quantifier
|
|
hash_t key
|
|
|
|
|
|
cdef struct PatternStateC:
|
|
TokenPatternC* pattern
|
|
int32_t start
|
|
int32_t length
|
|
|
|
|
|
cdef struct MatchC:
|
|
attr_t pattern_id
|
|
int32_t start
|
|
int32_t length
|
|
|
|
|
|
cdef class Matcher:
|
|
cdef Pool mem
|
|
cdef vector[TokenPatternC*] patterns
|
|
cdef readonly Vocab vocab
|
|
cdef public object validate
|
|
cdef public object _patterns
|
|
cdef public object _callbacks
|
|
cdef public object _filter
|
|
cdef public object _extensions
|
|
cdef public object _extra_predicates
|
|
cdef public object _seen_attrs
|