mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-07 21:54:54 +03:00
Implemented *? and +? in the matcher
Main changes. schemas.py: updated the TokenPatternOperatorSimple class to include +? and *? for token validation. matcher.pxd: added the quantifier and actions that are needed for the implementation of +? and *? (key additions: quantifier "ZERO_MINUS", actions "RETRY_OR_EXTEND" and "MATCH_ADVANCE"). matcher.pyx: added the cast_to_non_greedy_action() function, which casts get_action() outputs into actions that exhibit non-greedy behaviour; added the ZERO_MINUS quantifier to get_action(); added a few helper functions.
This commit is contained in:
parent
43d74d9cee
commit
616bc51743
|
@ -13,9 +13,12 @@ cdef enum action_t:
|
||||||
MATCH = 1000
|
MATCH = 1000
|
||||||
ADVANCE = 0100
|
ADVANCE = 0100
|
||||||
RETRY = 0010
|
RETRY = 0010
|
||||||
|
EXTEND = 0001
|
||||||
RETRY_EXTEND = 0011
|
RETRY_EXTEND = 0011
|
||||||
|
RETRY_OR_EXTEND = 0022
|
||||||
RETRY_ADVANCE = 0110
|
RETRY_ADVANCE = 0110
|
||||||
MATCH_EXTEND = 1001
|
MATCH_EXTEND = 1001
|
||||||
|
MATCH_ADVANCE = 1100
|
||||||
MATCH_REJECT = 2000
|
MATCH_REJECT = 2000
|
||||||
MATCH_DOUBLE = 3000
|
MATCH_DOUBLE = 3000
|
||||||
|
|
||||||
|
@ -24,8 +27,8 @@ cdef enum quantifier_t:
|
||||||
ZERO
|
ZERO
|
||||||
ZERO_ONE
|
ZERO_ONE
|
||||||
ZERO_PLUS
|
ZERO_PLUS
|
||||||
|
ZERO_MINUS
|
||||||
ONE
|
ONE
|
||||||
ONE_PLUS
|
|
||||||
FINAL_ID
|
FINAL_ID
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -25,10 +25,8 @@ from ..errors import Errors, MatchPatternError, Warnings
|
||||||
from ..strings import get_string_id
|
from ..strings import get_string_id
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
|
||||||
cdef class Matcher:
|
cdef class Matcher:
|
||||||
"""Match sequences of tokens, based on pattern rules.
|
"""Match sequences of tokens, based on pattern rules.
|
||||||
|
|
||||||
|
@ -73,7 +71,7 @@ cdef class Matcher:
|
||||||
"""
|
"""
|
||||||
return self.has_key(key)
|
return self.has_key(key)
|
||||||
|
|
||||||
def add(self, key, patterns, *, on_match=None, greedy: str=None):
|
def add(self, key, patterns, *, on_match=None, greedy: str = None):
|
||||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||||
key, an on_match callback, and one or more patterns.
|
key, an on_match callback, and one or more patterns.
|
||||||
|
|
||||||
|
@ -90,6 +88,8 @@ cdef class Matcher:
|
||||||
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
||||||
'+': Require the pattern to match 1 or more times.
|
'+': Require the pattern to match 1 or more times.
|
||||||
'*': Allow the pattern to match zero or more times.
|
'*': Allow the pattern to match zero or more times.
|
||||||
|
'+?': Require the pattern to match non-greedily 1 or more times.
|
||||||
|
'*?': Allow the pattern to match non-greedily 0 or more times.
|
||||||
'{n}': Require the pattern to match exactly _n_ times.
|
'{n}': Require the pattern to match exactly _n_ times.
|
||||||
'{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
|
'{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
|
||||||
'{n,}': Require the pattern to match at least _n_ times.
|
'{n,}': Require the pattern to match at least _n_ times.
|
||||||
|
@ -128,7 +128,7 @@ cdef class Matcher:
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
try:
|
try:
|
||||||
specs = _preprocess_pattern(pattern, self.vocab,
|
specs = _preprocess_pattern(pattern, self.vocab,
|
||||||
self._extensions, self._extra_predicates)
|
self._extensions, self._extra_predicates)
|
||||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||||
for spec in specs:
|
for spec in specs:
|
||||||
for attr, _ in spec[1]:
|
for attr, _ in spec[1]:
|
||||||
|
@ -160,7 +160,7 @@ cdef class Matcher:
|
||||||
while i < self.patterns.size():
|
while i < self.patterns.size():
|
||||||
pattern_key = get_ent_id(self.patterns.at(i))
|
pattern_key = get_ent_id(self.patterns.at(i))
|
||||||
if pattern_key == norm_key:
|
if pattern_key == norm_key:
|
||||||
self.patterns.erase(self.patterns.begin()+i)
|
self.patterns.erase(self.patterns.begin() + i)
|
||||||
else:
|
else:
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
|
@ -253,7 +253,8 @@ cdef class Matcher:
|
||||||
matches = []
|
matches = []
|
||||||
else:
|
else:
|
||||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||||
extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
|
extensions=self._extensions, predicates=self._extra_predicates,
|
||||||
|
with_alignments=with_alignments)
|
||||||
final_matches = []
|
final_matches = []
|
||||||
pairs_by_id = {}
|
pairs_by_id = {}
|
||||||
# For each key, either add all matches, or only the filtered,
|
# For each key, either add all matches, or only the filtered,
|
||||||
|
@ -267,21 +268,21 @@ cdef class Matcher:
|
||||||
pairs_by_id[key] = pairs
|
pairs_by_id[key] = pairs
|
||||||
else:
|
else:
|
||||||
final_matches.append((key, *match))
|
final_matches.append((key, *match))
|
||||||
matched = <char*>tmp_pool.alloc(length, sizeof(char))
|
matched = <char *> tmp_pool.alloc(length, sizeof(char))
|
||||||
empty = <char*>tmp_pool.alloc(length, sizeof(char))
|
empty = <char *> tmp_pool.alloc(length, sizeof(char))
|
||||||
for key, pairs in pairs_by_id.items():
|
for key, pairs in pairs_by_id.items():
|
||||||
memset(matched, 0, length * sizeof(matched[0]))
|
memset(matched, 0, length * sizeof(matched[0]))
|
||||||
span_filter = self._filter.get(key)
|
span_filter = self._filter.get(key)
|
||||||
if span_filter == "FIRST":
|
if span_filter == "FIRST":
|
||||||
sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
|
sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
|
||||||
elif span_filter == "LONGEST":
|
elif span_filter == "LONGEST":
|
||||||
sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
|
sorted_pairs = sorted(pairs, key=lambda x: (x[1] - x[0], -x[0]), reverse=True) # reverse sort by length
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
|
raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
|
||||||
for match in sorted_pairs:
|
for match in sorted_pairs:
|
||||||
start, end = match[:2]
|
start, end = match[:2]
|
||||||
assert 0 <= start < end # Defend against segfaults
|
assert 0 <= start < end # Defend against segfaults
|
||||||
span_len = end-start
|
span_len = end - start
|
||||||
# If no tokens in the span have matched
|
# If no tokens in the span have matched
|
||||||
if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0:
|
if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0:
|
||||||
final_matches.append((key, *match))
|
final_matches.append((key, *match))
|
||||||
|
@ -301,9 +302,9 @@ cdef class Matcher:
|
||||||
final_results = []
|
final_results = []
|
||||||
for key, start, end, alignments in final_matches:
|
for key, start, end, alignments in final_matches:
|
||||||
sorted_alignments = sorted(alignments, key=lambda x: (x['length'], x['token_idx']), reverse=False)
|
sorted_alignments = sorted(alignments, key=lambda x: (x['length'], x['token_idx']), reverse=False)
|
||||||
alignments = [0] * (end-start)
|
alignments = [0] * (end - start)
|
||||||
for align in sorted_alignments:
|
for align in sorted_alignments:
|
||||||
if align['length'] >= end-start:
|
if align['length'] >= end - start:
|
||||||
continue
|
continue
|
||||||
# Since alignments are sorted in order of (length, token_idx)
|
# Since alignments are sorted in order of (length, token_idx)
|
||||||
# this overwrites smaller token_idx when they have same length.
|
# this overwrites smaller token_idx when they have same length.
|
||||||
|
@ -325,7 +326,6 @@ cdef class Matcher:
|
||||||
else:
|
else:
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
def unpickle_matcher(vocab, patterns, callbacks):
|
def unpickle_matcher(vocab, patterns, callbacks):
|
||||||
matcher = Matcher(vocab)
|
matcher = Matcher(vocab)
|
||||||
for key, pattern in patterns.items():
|
for key, pattern in patterns.items():
|
||||||
|
@ -333,8 +333,8 @@ def unpickle_matcher(vocab, patterns, callbacks):
|
||||||
matcher.add(key, pattern, on_match=callback)
|
matcher.add(key, pattern, on_match=callback)
|
||||||
return matcher
|
return matcher
|
||||||
|
|
||||||
|
cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(),
|
||||||
cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0):
|
bint with_alignments=0):
|
||||||
"""Find matches in a doc, with a compiled array of patterns. Matches are
|
"""Find matches in a doc, with a compiled array of patterns. Matches are
|
||||||
returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0)
|
returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0)
|
||||||
|
|
||||||
|
@ -358,13 +358,13 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
# avoid any processing or mem alloc if the document is empty
|
# avoid any processing or mem alloc if the document is empty
|
||||||
return output
|
return output
|
||||||
if len(predicates) > 0:
|
if len(predicates) > 0:
|
||||||
predicate_cache = <int8_t*>mem.alloc(length * len(predicates), sizeof(int8_t))
|
predicate_cache = <int8_t *> mem.alloc(length * len(predicates), sizeof(int8_t))
|
||||||
if extensions is not None and len(extensions) >= 1:
|
if extensions is not None and len(extensions) >= 1:
|
||||||
nr_extra_attr = max(extensions.values()) + 1
|
nr_extra_attr = max(extensions.values()) + 1
|
||||||
extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
|
extra_attr_values = <attr_t *> mem.alloc(length * nr_extra_attr, sizeof(attr_t))
|
||||||
else:
|
else:
|
||||||
nr_extra_attr = 0
|
nr_extra_attr = 0
|
||||||
extra_attr_values = <attr_t*>mem.alloc(length, sizeof(attr_t))
|
extra_attr_values = <attr_t *> mem.alloc(length, sizeof(attr_t))
|
||||||
for i, token in enumerate(doclike):
|
for i, token in enumerate(doclike):
|
||||||
for name, index in extensions.items():
|
for name, index in extensions.items():
|
||||||
value = token._.get(name)
|
value = token._.get(name)
|
||||||
|
@ -378,8 +378,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
states.push_back(PatternStateC(patterns[j], i, 0))
|
states.push_back(PatternStateC(patterns[j], i, 0))
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_states.resize(states.size())
|
align_states.resize(states.size())
|
||||||
transition_states(states, matches, align_states, align_matches, predicate_cache,
|
transition_states(states, matches, align_states, align_matches, predicate_cache, doclike[i], extra_attr_values,
|
||||||
doclike[i], extra_attr_values, predicates, with_alignments)
|
predicates, with_alignments)
|
||||||
extra_attr_values += nr_extra_attr
|
extra_attr_values += nr_extra_attr
|
||||||
predicate_cache += len(predicates)
|
predicate_cache += len(predicates)
|
||||||
# Handle matches that end in 0-width patterns
|
# Handle matches that end in 0-width patterns
|
||||||
|
@ -389,7 +389,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
match = (
|
match = (
|
||||||
matches[i].pattern_id,
|
matches[i].pattern_id,
|
||||||
matches[i].start,
|
matches[i].start,
|
||||||
matches[i].start+matches[i].length
|
matches[i].start + matches[i].length
|
||||||
)
|
)
|
||||||
# We need to deduplicate, because we could otherwise arrive at the same
|
# We need to deduplicate, because we could otherwise arrive at the same
|
||||||
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
|
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
|
||||||
|
@ -404,21 +404,19 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
||||||
seen.add(match)
|
seen.add(match)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||||
vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
|
vector[vector[MatchAlignmentC]]& align_states,
|
||||||
int8_t* cached_py_predicates,
|
vector[vector[MatchAlignmentC]]& align_matches,
|
||||||
Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
|
int8_t * cached_py_predicates, Token token,
|
||||||
|
const attr_t * extra_attrs, py_predicates, bint with_alignments) except *:
|
||||||
cdef int q = 0
|
cdef int q = 0
|
||||||
cdef vector[PatternStateC] new_states
|
cdef vector[PatternStateC] new_states
|
||||||
cdef vector[vector[MatchAlignmentC]] align_new_states
|
cdef vector[vector[MatchAlignmentC]] align_new_states
|
||||||
cdef int nr_predicate = len(py_predicates)
|
cdef int nr_predicate = len(py_predicates)
|
||||||
for i in range(states.size()):
|
for i in range(states.size()):
|
||||||
if states[i].pattern.nr_py >= 1:
|
if states[i].pattern.nr_py >= 1:
|
||||||
update_predicate_cache(cached_py_predicates,
|
update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates)
|
||||||
states[i].pattern, token, py_predicates)
|
action = get_action(states[i], token.c, extra_attrs, cached_py_predicates)
|
||||||
action = get_action(states[i], token.c, extra_attrs,
|
|
||||||
cached_py_predicates)
|
|
||||||
if action == REJECT:
|
if action == REJECT:
|
||||||
continue
|
continue
|
||||||
# Keep only a subset of states (the active ones). Index q is the
|
# Keep only a subset of states (the active ones). Index q is the
|
||||||
|
@ -431,31 +429,32 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_state = align_states[i]
|
align_state = align_states[i]
|
||||||
align_states[q] = align_state
|
align_states[q] = align_state
|
||||||
while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND):
|
while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND, RETRY_OR_EXTEND):
|
||||||
# Update alignment before the transition of current state
|
# Update alignment before the transition of current state
|
||||||
# 'MatchAlignmentC' maps 'original token index of current pattern' to 'current matching length'
|
# 'MatchAlignmentC' maps 'original token index of current pattern' to 'current matching length'
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
|
align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
|
||||||
if action == RETRY_EXTEND:
|
if action in [RETRY_EXTEND, RETRY_OR_EXTEND]:
|
||||||
# This handles the 'extend'
|
# This handles the 'extend'
|
||||||
new_states.push_back(
|
new_states.push_back(
|
||||||
PatternStateC(pattern=states[q].pattern, start=state.start,
|
PatternStateC(pattern=states[q].pattern, start=state.start, length=state.length + 1))
|
||||||
length=state.length+1))
|
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_new_states.push_back(align_states[q])
|
align_new_states.push_back(align_states[q])
|
||||||
if action == RETRY_ADVANCE:
|
if action == RETRY_ADVANCE:
|
||||||
# This handles the 'advance'
|
# This handles the 'advance'
|
||||||
new_states.push_back(
|
new_states.push_back(
|
||||||
PatternStateC(pattern=states[q].pattern+1, start=state.start,
|
PatternStateC(pattern=states[q].pattern + 1, start=state.start, length=state.length + 1))
|
||||||
length=state.length+1))
|
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_new_states.push_back(align_states[q])
|
align_new_states.push_back(align_states[q])
|
||||||
states[q].pattern += 1
|
states[q].pattern += 1
|
||||||
if states[q].pattern.nr_py != 0:
|
if states[q].pattern.nr_py != 0:
|
||||||
update_predicate_cache(cached_py_predicates,
|
update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates)
|
||||||
states[q].pattern, token, py_predicates)
|
next_action = get_action(states[q], token.c, extra_attrs, cached_py_predicates)
|
||||||
action = get_action(states[q], token.c, extra_attrs,
|
# To account for *? and +?
|
||||||
cached_py_predicates)
|
if get_quantifier(state) == ZERO_MINUS:
|
||||||
|
next_action = cast_to_non_greedy_action(action, next_action, new_states, align_new_states,
|
||||||
|
with_alignments)
|
||||||
|
action = next_action
|
||||||
# Update alignment before the transition of current state
|
# Update alignment before the transition of current state
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
|
align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
|
||||||
|
@ -465,48 +464,49 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
states[q].pattern += 1
|
states[q].pattern += 1
|
||||||
states[q].length += 1
|
states[q].length += 1
|
||||||
q += 1
|
q += 1
|
||||||
|
elif action == EXTEND:
|
||||||
|
states[q].length += 1
|
||||||
|
q += 1
|
||||||
else:
|
else:
|
||||||
ent_id = get_ent_id(state.pattern)
|
ent_id = get_ent_id(state.pattern)
|
||||||
if action == MATCH:
|
if action == MATCH:
|
||||||
matches.push_back(
|
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length+1))
|
|
||||||
# `align_matches` always corresponds to `matches` 1:1
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
elif action == MATCH_DOUBLE:
|
elif action == MATCH_DOUBLE:
|
||||||
# push match without last token if length > 0
|
# push match without last token if length > 0
|
||||||
if state.length > 0:
|
if state.length > 0:
|
||||||
matches.push_back(
|
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length))
|
|
||||||
# MATCH_DOUBLE emits matches twice,
|
# MATCH_DOUBLE emits matches twice,
|
||||||
# add one more to align_matches in order to keep 1:1 relationship
|
# add one more to align_matches in order to keep 1:1 relationship
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
# push match with last token
|
# push match with last token
|
||||||
matches.push_back(
|
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length+1))
|
|
||||||
# `align_matches` always corresponds to `matches` 1:1
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
elif action == MATCH_REJECT:
|
elif action == MATCH_REJECT:
|
||||||
matches.push_back(
|
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length))
|
|
||||||
# `align_matches` always corresponds to `matches` 1:1
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
elif action == MATCH_EXTEND:
|
elif action == MATCH_EXTEND:
|
||||||
matches.push_back(
|
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length))
|
|
||||||
# `align_matches` always corresponds to `matches` 1:1
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_matches.push_back(align_states[q])
|
align_matches.push_back(align_states[q])
|
||||||
states[q].length += 1
|
states[q].length += 1
|
||||||
q += 1
|
q += 1
|
||||||
|
elif action == MATCH_ADVANCE:
|
||||||
|
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
|
||||||
|
# `align_matches` always corresponds to `matches` 1:1
|
||||||
|
if with_alignments != 0:
|
||||||
|
align_matches.push_back(align_states[q])
|
||||||
|
states[q].pattern += 1
|
||||||
|
states[q].length += 1
|
||||||
|
q += 1
|
||||||
states.resize(q)
|
states.resize(q)
|
||||||
for i in range(new_states.size()):
|
for i in range(new_states.size()):
|
||||||
states.push_back(new_states[i])
|
states.push_back(new_states[i])
|
||||||
|
@ -516,9 +516,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
||||||
for i in range(align_new_states.size()):
|
for i in range(align_new_states.size()):
|
||||||
align_states.push_back(align_new_states[i])
|
align_states.push_back(align_new_states[i])
|
||||||
|
|
||||||
|
cdef int update_predicate_cache(int8_t * cache,
|
||||||
cdef int update_predicate_cache(int8_t* cache,
|
const TokenPatternC * pattern, Token token, predicates) except -1:
|
||||||
const TokenPatternC* pattern, Token token, predicates) except -1:
|
|
||||||
# If the state references any extra predicates, check whether they match.
|
# If the state references any extra predicates, check whether they match.
|
||||||
# These are cached, so that we don't call these potentially expensive
|
# These are cached, so that we don't call these potentially expensive
|
||||||
# Python functions more than we need to.
|
# Python functions more than we need to.
|
||||||
|
@ -536,7 +535,6 @@ cdef int update_predicate_cache(int8_t* cache,
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E125.format(value=result))
|
raise ValueError(Errors.E125.format(value=result))
|
||||||
|
|
||||||
|
|
||||||
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
||||||
vector[vector[MatchAlignmentC]]& align_matches,
|
vector[vector[MatchAlignmentC]]& align_matches,
|
||||||
vector[vector[MatchAlignmentC]]& align_states,
|
vector[vector[MatchAlignmentC]]& align_states,
|
||||||
|
@ -546,9 +544,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
||||||
cdef vector[MatchAlignmentC] align_state
|
cdef vector[MatchAlignmentC] align_state
|
||||||
for i in range(states.size()):
|
for i in range(states.size()):
|
||||||
state = states[i]
|
state = states[i]
|
||||||
|
if is_non_greedy_star(state):
|
||||||
|
# if the final pattern token is a *?, remove the match by skipping it.
|
||||||
|
continue
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_state = align_states[i]
|
align_state = align_states[i]
|
||||||
while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
|
while get_quantifier(state) in (ZERO_PLUS, ZERO_MINUS, ZERO_ONE):
|
||||||
# Update alignment before the transition of current state
|
# Update alignment before the transition of current state
|
||||||
if with_alignments != 0:
|
if with_alignments != 0:
|
||||||
align_state.push_back(MatchAlignmentC(state.pattern.token_idx, state.length))
|
align_state.push_back(MatchAlignmentC(state.pattern.token_idx, state.length))
|
||||||
|
@ -564,13 +565,11 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
||||||
else:
|
else:
|
||||||
state.pattern += 1
|
state.pattern += 1
|
||||||
|
|
||||||
|
cdef action_t get_action(PatternStateC state, const TokenC * token, const attr_t * extra_attrs,
|
||||||
cdef action_t get_action(PatternStateC state,
|
const int8_t * predicate_matches) nogil:
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
|
||||||
const int8_t* predicate_matches) nogil:
|
|
||||||
"""We need to consider:
|
"""We need to consider:
|
||||||
a) Does the token match the specification? [Yes, No]
|
a) Does the token match the specification? [Yes, No]
|
||||||
b) What's the quantifier? [1, 0+, ?]
|
b) What's the quantifier? [1, 0+, 0-, ?]
|
||||||
c) Is this the last specification? [final, non-final]
|
c) Is this the last specification? [final, non-final]
|
||||||
|
|
||||||
We can transition in the following ways:
|
We can transition in the following ways:
|
||||||
|
@ -580,7 +579,8 @@ cdef action_t get_action(PatternStateC state,
|
||||||
d) Do we add a state with (same state, next token)?
|
d) Do we add a state with (same state, next token)?
|
||||||
|
|
||||||
We'll code the actions as boolean strings, so 0000 means no to all 4,
|
We'll code the actions as boolean strings, so 0000 means no to all 4,
|
||||||
1000 means match but no states added, etc.
|
1000 means match but no states added,
|
||||||
|
and digits other than 0 and 1 represent special actions, etc.
|
||||||
|
|
||||||
1:
|
1:
|
||||||
Yes, final:
|
Yes, final:
|
||||||
|
@ -600,13 +600,22 @@ cdef action_t get_action(PatternStateC state,
|
||||||
1000 (note: Don't include last token!)
|
1000 (note: Don't include last token!)
|
||||||
No, non-final:
|
No, non-final:
|
||||||
0010
|
0010
|
||||||
|
0-:
|
||||||
|
Yes, final:
|
||||||
|
2000 (note: Don't include last token!)
|
||||||
|
Yes, non-final:
|
||||||
|
0022 (note: Retry or Extend)
|
||||||
|
No, final:
|
||||||
|
2000 (note: Don't include last token!)
|
||||||
|
No, non-final:
|
||||||
|
0010
|
||||||
?:
|
?:
|
||||||
Yes, final:
|
Yes, final:
|
||||||
1000
|
3000
|
||||||
Yes, non-final:
|
Yes, non-final:
|
||||||
0100
|
0100
|
||||||
No, final:
|
No, final:
|
||||||
1000 (note: Don't include last token!)
|
2000 (note: Don't include last token!)
|
||||||
No, non-final:
|
No, non-final:
|
||||||
0010
|
0010
|
||||||
|
|
||||||
|
@ -617,9 +626,12 @@ cdef action_t get_action(PatternStateC state,
|
||||||
MATCH = 1000
|
MATCH = 1000
|
||||||
ADVANCE = 0100
|
ADVANCE = 0100
|
||||||
RETRY = 0010
|
RETRY = 0010
|
||||||
|
EXTEND = 0001
|
||||||
MATCH_EXTEND = 1001
|
MATCH_EXTEND = 1001
|
||||||
|
MATCH_ADVANCE = 1100
|
||||||
RETRY_ADVANCE = 0110
|
RETRY_ADVANCE = 0110
|
||||||
RETRY_EXTEND = 0011
|
RETRY_EXTEND = 0011
|
||||||
|
RETRY_OR_EXTEND = 0022 # If there is a Match after Retry, does not Extend
|
||||||
MATCH_REJECT = 2000 # Match, but don't include last token
|
MATCH_REJECT = 2000 # Match, but don't include last token
|
||||||
MATCH_DOUBLE = 3000 # Match both with and without last token
|
MATCH_DOUBLE = 3000 # Match both with and without last token
|
||||||
|
|
||||||
|
@ -633,53 +645,75 @@ cdef action_t get_action(PatternStateC state,
|
||||||
is_match = not is_match
|
is_match = not is_match
|
||||||
quantifier = ONE
|
quantifier = ONE
|
||||||
if quantifier == ONE:
|
if quantifier == ONE:
|
||||||
if is_match and is_final:
|
if is_match and is_final:
|
||||||
# Yes, final: 1000
|
# Yes, final: 1000
|
||||||
return MATCH
|
return MATCH
|
||||||
elif is_match and not is_final:
|
elif is_non_greedy_plus(state) and has_star_tail(state) and is_match and not is_final:
|
||||||
# Yes, non-final: 0100
|
# Yes, non-final: 1100
|
||||||
return ADVANCE
|
# Modification for +?:
|
||||||
elif not is_match and is_final:
|
# Having MATCH_ADVANCE handles the match at the 'ONE' part of the token instead of relying on MATCH_REJECT
|
||||||
# No, final: 0000
|
# and other actions from other tokens to produce a match.
|
||||||
return REJECT
|
# is_non_greedy_plus() verifies that the current state's pattern is +?
|
||||||
else:
|
# has_star_tail() verifies the remaining pattern tokens are either * or *?,
|
||||||
return REJECT
|
# so that it is valid for the current match to exist.
|
||||||
|
return MATCH_ADVANCE
|
||||||
|
elif is_match and not is_final:
|
||||||
|
# Yes, non-final: 0100
|
||||||
|
return ADVANCE
|
||||||
|
elif not is_match and is_final:
|
||||||
|
# No, final: 0000
|
||||||
|
return REJECT
|
||||||
|
else:
|
||||||
|
return REJECT
|
||||||
elif quantifier == ZERO_PLUS:
|
elif quantifier == ZERO_PLUS:
|
||||||
if is_match and is_final:
|
if is_match and is_final:
|
||||||
# Yes, final: 1001
|
# Yes, final: 1001
|
||||||
return MATCH_EXTEND
|
return MATCH_EXTEND
|
||||||
elif is_match and not is_final:
|
elif is_match and not is_final:
|
||||||
# Yes, non-final: 0011
|
# Yes, non-final: 0011
|
||||||
return RETRY_EXTEND
|
return RETRY_EXTEND
|
||||||
elif not is_match and is_final:
|
elif not is_match and is_final:
|
||||||
# No, final 2000 (note: Don't include last token!)
|
# No, final 2000 (note: Don't include last token!)
|
||||||
return MATCH_REJECT
|
return MATCH_REJECT
|
||||||
else:
|
else:
|
||||||
# No, non-final 0010
|
# No, non-final 0010
|
||||||
return RETRY
|
return RETRY
|
||||||
|
elif quantifier == ZERO_MINUS:
|
||||||
|
if is_final or has_non_greedy_tail(state):
|
||||||
|
# Yes/No, final: 2000 (note: Don't include last token!)
|
||||||
|
return MATCH_REJECT
|
||||||
|
elif is_match:
|
||||||
|
# Yes, non-final: 0022
|
||||||
|
# If there is a match, further extensions are skipped so that the behaviour is non-greedy
|
||||||
|
# pattern: b*?b string: b b
|
||||||
|
# We do not extend on first b to exhibit non-greedy behaviour
|
||||||
|
# such that "b" is matched but "b b" is not matched
|
||||||
|
return RETRY_OR_EXTEND
|
||||||
|
else:
|
||||||
|
# No, non-final 0010
|
||||||
|
return RETRY
|
||||||
elif quantifier == ZERO_ONE:
|
elif quantifier == ZERO_ONE:
|
||||||
if is_match and is_final:
|
if is_match and is_final:
|
||||||
# Yes, final: 3000
|
# Yes, final: 3000
|
||||||
# To cater for a pattern ending in "?", we need to add
|
# To cater for a pattern ending in "?", we need to add
|
||||||
# a match both with and without the last token
|
# a match both with and without the last token
|
||||||
return MATCH_DOUBLE
|
return MATCH_DOUBLE
|
||||||
elif is_match and not is_final:
|
elif is_match and not is_final:
|
||||||
# Yes, non-final: 0110
|
# Yes, non-final: 0110
|
||||||
# We need both branches here, consider a pair like:
|
# We need both branches here, consider a pair like:
|
||||||
# pattern: .?b string: b
|
# pattern: .?b string: b
|
||||||
# If we 'ADVANCE' on the .?, we miss the match.
|
# If we 'ADVANCE' on the .?, we miss the match.
|
||||||
return RETRY_ADVANCE
|
return RETRY_ADVANCE
|
||||||
elif not is_match and is_final:
|
elif not is_match and is_final:
|
||||||
# No, final 2000 (note: Don't include last token!)
|
# No, final 2000 (note: Don't include last token!)
|
||||||
return MATCH_REJECT
|
return MATCH_REJECT
|
||||||
else:
|
else:
|
||||||
# No, non-final 0010
|
# No, non-final 0010
|
||||||
return RETRY
|
return RETRY
|
||||||
|
|
||||||
|
|
||||||
cdef int8_t get_is_match(PatternStateC state,
|
cdef int8_t get_is_match(PatternStateC state,
|
||||||
const TokenC* token, const attr_t* extra_attrs,
|
const TokenC * token, const attr_t * extra_attrs,
|
||||||
const int8_t* predicate_matches) nogil:
|
const int8_t * predicate_matches) nogil:
|
||||||
for i in range(state.pattern.nr_py):
|
for i in range(state.pattern.nr_py):
|
||||||
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
||||||
return 0
|
return 0
|
||||||
|
@ -693,6 +727,61 @@ cdef int8_t get_is_match(PatternStateC state,
|
||||||
return 0
|
return 0
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
cdef action_t cast_to_non_greedy_action(action_t action, action_t next_action, vector[PatternStateC]& new_states,
                                        vector[vector[MatchAlignmentC]]& align_new_states, bint with_alignments) nogil:
    """Rewrite "next_action" so that it behaves non-greedily relative to "action".

    This cannot be merged into get_action() because the decision compares the
    actions produced by two different pattern tokens. Only the combinations
    listed below are rewritten; every other "next_action" is returned as is.

    next_action == MATCH_REJECT (for any "action") -> REJECT
        - Drops the match, because it would end with the '*?' pattern token.
        - E.g. pattern = "a*? b*", doc = "a a"
        - MATCH_REJECT would add 'a' to the matches in transition_states();
          returning REJECT removes that result.

    action == RETRY_OR_EXTEND, next_action == MATCH -> MATCH
        - Stops the extension once there is a match: the last entry of
          "new_states" is popped, and the last entry of "align_new_states"
          as well when "with_alignments" is set.

    action == RETRY, next_action == MATCH_EXTEND -> EXTEND
        (the RETRY comes from the ZERO_MINUS quantifier)
        - Drops the match, because it would end with the '*?' pattern token.
        - E.g. pattern = "a*? b*", doc = "a b"
        - MATCH_EXTEND would add 'a' to the matches in transition_states();
          returning EXTEND removes that result.

    action == RETRY, next_action == MATCH_DOUBLE -> MATCH
        (the RETRY comes from the ZERO_MINUS quantifier)
        - MATCH_DOUBLE adds two matches, one with and one without the last
          token. The one without the last token ends with the '*?' pattern
          token; returning MATCH keeps only the one with the last token.
        - E.g. pattern = "a*? b?", doc = "a b"
        - MATCH_DOUBLE would add both 'a' and 'a b'; MATCH removes 'a'.
    """
    # A match that excludes the current token would end on the '*?' token,
    # so it is rejected regardless of what the first action was.
    if next_action == MATCH_REJECT:
        return REJECT

    if action == RETRY_OR_EXTEND:
        if next_action == MATCH:
            # Stop the extension once there is a match: discard the most
            # recently pushed state (and its alignments, when tracked).
            new_states.pop_back()
            if with_alignments:
                align_new_states.pop_back()
            return MATCH
    elif action == RETRY:
        if next_action == MATCH_EXTEND:
            # Keep extending, but do not record a match ending on '*?'.
            return EXTEND
        if next_action == MATCH_DOUBLE:
            # Keep only the match that includes the last token.
            return MATCH

    return next_action
|
||||||
|
|
||||||
cdef inline int8_t get_is_final(PatternStateC state) nogil:
|
cdef inline int8_t get_is_final(PatternStateC state) nogil:
|
||||||
if state.pattern[1].quantifier == FINAL_ID:
|
if state.pattern[1].quantifier == FINAL_ID:
|
||||||
|
@ -700,31 +789,60 @@ cdef inline int8_t get_is_final(PatternStateC state) nogil:
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef inline int8_t get_quantifier(PatternStateC state) nogil:
|
cdef inline int8_t get_quantifier(PatternStateC state) nogil:
|
||||||
return state.pattern.quantifier
|
return state.pattern.quantifier
|
||||||
|
|
||||||
|
cdef inline int8_t is_non_greedy_plus(PatternStateC state) nogil:
    """Return 1 if "state" is on the leading entry of a '+?' operator, else 0.

    The check passes only when the current entry is ONE, the entry right
    after it is ZERO_MINUS, and both entries have the same token_idx
    (presumably because both were expanded from a single '+?' token spec).
    """
    # The following entry must be the lazy '*?' half ...
    if state.pattern[1].quantifier != ZERO_MINUS:
        return 0
    # ... the current entry must be the mandatory ONE half ...
    if get_quantifier(state) != ONE:
        return 0
    # ... and both halves must point at the same token of the pattern.
    if state.pattern[1].token_idx != state.pattern.token_idx:
        return 0
    return 1
|
||||||
|
|
||||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
|
cdef inline int8_t is_non_greedy_star(PatternStateC state) nogil:
    """Return 1 if "state" is on a '*?' entry that is not the tail of '+?', else 0.

    The current entry must be ZERO_MINUS and the entry before it must not
    be ONE.

    NOTE(review): this reads the entry *before* state.pattern. If the state
    can sit on the first entry of its pattern array (a pattern that starts
    with '*?'), that read is out of bounds -- confirm against the callers.
    NOTE(review): unlike is_non_greedy_plus(), token_idx is not compared, so
    a '*?' that directly follows an explicit ONE token is also reported as 0
    -- confirm this is intended.
    """
    # Preceded by ONE: treated as the second half of '+?', not a plain '*?'.
    if (state.pattern - 1).quantifier == ONE:
        return 0
    if get_quantifier(state) != ZERO_MINUS:
        return 0
    return 1
|
||||||
|
|
||||||
|
cdef inline int8_t has_star_tail(PatternStateC state) nogil:
    """Return 1 if every pattern entry after the current one is '*' or '*?'.

    "state" is received by value, so advancing its pattern pointer here does
    not affect the caller's state. Returns 1 as well when the current entry
    is already the last one before the FINAL_ID entry.
    """
    cdef int8_t upcoming
    # Walk forward until the entry after the cursor is the FINAL_ID marker.
    while get_is_final(state) == 0:
        state.pattern += 1
        upcoming = get_quantifier(state)
        if upcoming != ZERO_PLUS and upcoming != ZERO_MINUS:
            return 0
    return 1
|
||||||
|
|
||||||
|
cdef inline int8_t has_non_greedy_tail(PatternStateC state) nogil:
    """Return 1 if every pattern entry after the current one is '*?'.

    "state" is received by value, so advancing its pattern pointer here does
    not affect the caller's state. Returns 1 as well when the current entry
    is already the last one before the FINAL_ID entry.
    """
    while True:
        # Nothing left after the cursor except the FINAL_ID marker.
        if get_is_final(state):
            return 1
        state.pattern += 1
        if state.pattern.quantifier != ZERO_MINUS:
            return 0
|
||||||
|
|
||||||
|
cdef TokenPatternC * init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
|
||||||
|
pattern = <TokenPatternC *> mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
|
||||||
cdef int i, index
|
cdef int i, index
|
||||||
for i, (quantifier, spec, extensions, predicates, token_idx) in enumerate(token_specs):
|
for i, (quantifier, spec, extensions, predicates, token_idx) in enumerate(token_specs):
|
||||||
pattern[i].quantifier = quantifier
|
pattern[i].quantifier = quantifier
|
||||||
# Ensure attrs refers to a null pointer if nr_attr == 0
|
# Ensure attrs refers to a null pointer if nr_attr == 0
|
||||||
if len(spec) > 0:
|
if len(spec) > 0:
|
||||||
pattern[i].attrs = <AttrValueC*>mem.alloc(len(spec), sizeof(AttrValueC))
|
pattern[i].attrs = <AttrValueC *> mem.alloc(len(spec), sizeof(AttrValueC))
|
||||||
pattern[i].nr_attr = len(spec)
|
pattern[i].nr_attr = len(spec)
|
||||||
for j, (attr, value) in enumerate(spec):
|
for j, (attr, value) in enumerate(spec):
|
||||||
pattern[i].attrs[j].attr = attr
|
pattern[i].attrs[j].attr = attr
|
||||||
pattern[i].attrs[j].value = value
|
pattern[i].attrs[j].value = value
|
||||||
if len(extensions) > 0:
|
if len(extensions) > 0:
|
||||||
pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
|
pattern[i].extra_attrs = <IndexValueC *> mem.alloc(len(extensions), sizeof(IndexValueC))
|
||||||
for j, (index, value) in enumerate(extensions):
|
for j, (index, value) in enumerate(extensions):
|
||||||
pattern[i].extra_attrs[j].index = index
|
pattern[i].extra_attrs[j].index = index
|
||||||
pattern[i].extra_attrs[j].value = value
|
pattern[i].extra_attrs[j].value = value
|
||||||
pattern[i].nr_extra_attr = len(extensions)
|
pattern[i].nr_extra_attr = len(extensions)
|
||||||
if len(predicates) > 0:
|
if len(predicates) > 0:
|
||||||
pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
|
pattern[i].py_predicates = <int32_t *> mem.alloc(len(predicates), sizeof(int32_t))
|
||||||
for j, index in enumerate(predicates):
|
for j, index in enumerate(predicates):
|
||||||
pattern[i].py_predicates[j] = index
|
pattern[i].py_predicates[j] = index
|
||||||
pattern[i].nr_py = len(predicates)
|
pattern[i].nr_py = len(predicates)
|
||||||
|
@ -734,7 +852,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
|
||||||
# Use quantifier to identify final ID pattern node (rather than previous
|
# Use quantifier to identify final ID pattern node (rather than previous
|
||||||
# uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
|
# uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
|
||||||
pattern[i].quantifier = FINAL_ID
|
pattern[i].quantifier = FINAL_ID
|
||||||
pattern[i].attrs = <AttrValueC*>mem.alloc(1, sizeof(AttrValueC))
|
pattern[i].attrs = <AttrValueC *> mem.alloc(1, sizeof(AttrValueC))
|
||||||
pattern[i].attrs[0].attr = ID
|
pattern[i].attrs[0].attr = ID
|
||||||
pattern[i].attrs[0].value = entity_id
|
pattern[i].attrs[0].value = entity_id
|
||||||
pattern[i].nr_attr = 1
|
pattern[i].nr_attr = 1
|
||||||
|
@ -744,7 +862,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
|
||||||
return pattern
|
return pattern
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
cdef attr_t get_ent_id(const TokenPatternC * pattern) nogil:
|
||||||
while pattern.quantifier != FINAL_ID:
|
while pattern.quantifier != FINAL_ID:
|
||||||
pattern += 1
|
pattern += 1
|
||||||
id_attr = pattern[0].attrs[0]
|
id_attr = pattern[0].attrs[0]
|
||||||
|
@ -979,7 +1097,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
||||||
|
|
||||||
|
|
||||||
def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
||||||
seen_predicates):
|
seen_predicates):
|
||||||
output = []
|
output = []
|
||||||
for attr, value in spec.items():
|
for attr, value in spec.items():
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
|
@ -1000,7 +1118,8 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
||||||
def _get_operators(spec):
|
def _get_operators(spec):
|
||||||
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
||||||
lookup = {"*": (ZERO_PLUS,), "+": (ONE, ZERO_PLUS),
|
lookup = {"*": (ZERO_PLUS,), "+": (ONE, ZERO_PLUS),
|
||||||
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
|
"?": (ZERO_ONE,), "*?": (ZERO_MINUS,),
|
||||||
|
"+?": (ONE, ZERO_MINUS), "1": (ONE,), "!": (ZERO,)}
|
||||||
# Fix casing
|
# Fix casing
|
||||||
spec = {key.upper(): values for key, values in spec.items()
|
spec = {key.upper(): values for key, values in spec.items()
|
||||||
if isinstance(key, str)}
|
if isinstance(key, str)}
|
||||||
|
|
|
@ -202,6 +202,8 @@ class TokenPatternNumber(BaseModel):
|
||||||
class TokenPatternOperatorSimple(str, Enum):
|
class TokenPatternOperatorSimple(str, Enum):
|
||||||
plus: StrictStr = StrictStr("+")
|
plus: StrictStr = StrictStr("+")
|
||||||
star: StrictStr = StrictStr("*")
|
star: StrictStr = StrictStr("*")
|
||||||
|
plus_question: StrictStr = StrictStr("+?")
|
||||||
|
star_question: StrictStr = StrictStr("*?")
|
||||||
question: StrictStr = StrictStr("?")
|
question: StrictStr = StrictStr("?")
|
||||||
exclamation: StrictStr = StrictStr("!")
|
exclamation: StrictStr = StrictStr("!")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user