From 616bc51743c08045647116def7b821f27b6c8358 Mon Sep 17 00:00:00 2001
From: Source-Shen <82353723+Source-Shen@users.noreply.github.com>
Date: Sat, 2 Jul 2022 01:19:32 +0800
Subject: [PATCH] Implement *? and +? in the matcher

Main changes:

schemas.py
- Updated the TokenPatternOperatorSimple class to accept +? and *? during
  token pattern validation.

matcher.pxd
- Added the quantifier and actions needed to implement +? and *?.
- Key additions: quantifier ZERO_MINUS, actions RETRY_OR_EXTEND and
  MATCH_ADVANCE.

matcher.pyx
- Added cast_to_non_greedy_action(), which casts get_action() outputs into
  actions that demonstrate non-greedy behaviour.
- Added the ZERO_MINUS quantifier to get_action().
- Added a few helper functions.

(Illustrative usage sketches for the new operators follow the diff below.)
---
 spacy/matcher/matcher.pxd |   5 +-
 spacy/matcher/matcher.pyx | 361 +++++++++++++++++++++++++-------------
 spacy/schemas.py          |   2 +
 3 files changed, 246 insertions(+), 122 deletions(-)

diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index 455f978cc..180a88a89 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -13,9 +13,12 @@ cdef enum action_t:
     MATCH = 1000
     ADVANCE = 0100
     RETRY = 0010
+    EXTEND = 0001
     RETRY_EXTEND = 0011
+    RETRY_OR_EXTEND = 0022
     RETRY_ADVANCE = 0110
     MATCH_EXTEND = 1001
+    MATCH_ADVANCE = 1100
     MATCH_REJECT = 2000
     MATCH_DOUBLE = 3000
 
@@ -24,8 +27,8 @@ cdef enum quantifier_t:
     ZERO
     ZERO_ONE
     ZERO_PLUS
+    ZERO_MINUS
     ONE
-    ONE_PLUS
     FINAL_ID
 
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 5105f69ed..31752a1e2 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -25,10 +25,8 @@ from ..errors import Errors, MatchPatternError, Warnings
 from ..strings import get_string_id
 from ..attrs import IDS
 
-
 DEF PADDING = 5
 
-
 cdef class Matcher:
     """Match sequences of tokens, based on pattern rules.
 
@@ -73,7 +71,7 @@ cdef class Matcher:
         """
         return self.has_key(key)
 
-    def add(self, key, patterns, *, on_match=None, greedy: str=None):
+    def add(self, key, patterns, *, on_match=None, greedy: str = None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID
         key, an on_match callback, and one or more patterns.
 
@@ -90,6 +88,8 @@ cdef class Matcher:
         '?': Make the pattern optional, by allowing it to match 0 or 1 times.
         '+': Require the pattern to match 1 or more times.
         '*': Allow the pattern to match zero or more times.
+        '+?': Require the pattern to match non-greedily 1 or more times.
+        '*?': Allow the pattern to match non-greedily 0 or more times.
         '{n}': Require the pattern to match exactly _n_ times.
         '{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
         '{n,}': Require the pattern to match at least _n_ times.
@@ -128,7 +128,7 @@ cdef class Matcher:
         for pattern in patterns:
             try:
                 specs = _preprocess_pattern(pattern, self.vocab,
-                    self._extensions, self._extra_predicates)
+                                            self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:
@@ -160,7 +160,7 @@ cdef class Matcher:
         while i < self.patterns.size():
             pattern_key = get_ent_id(self.patterns.at(i))
             if pattern_key == norm_key:
-                self.patterns.erase(self.patterns.begin()+i)
+                self.patterns.erase(self.patterns.begin() + i)
             else:
                 i += 1
 
@@ -253,7 +253,8 @@ cdef class Matcher:
             matches = []
         else:
             matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                                   extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+                                   extensions=self._extensions, predicates=self._extra_predicates,
+                                   with_alignments=with_alignments)
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
@@ -267,21 +268,21 @@ cdef class Matcher:
                     pairs_by_id[key] = pairs
             else:
                 final_matches.append((key, *match))
-        matched = <char*>tmp_pool.alloc(length, sizeof(char))
-        empty = <char*>tmp_pool.alloc(length, sizeof(char))
+        matched = <char *> tmp_pool.alloc(length, sizeof(char))
+        empty = <char *> tmp_pool.alloc(length, sizeof(char))
         for key, pairs in pairs_by_id.items():
             memset(matched, 0, length * sizeof(matched[0]))
             span_filter = self._filter.get(key)
             if span_filter == "FIRST":
-                sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
+                sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False)  # sort by start
             elif span_filter == "LONGEST":
-                sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
+                sorted_pairs = sorted(pairs, key=lambda x: (x[1] - x[0], -x[0]), reverse=True)  # reverse sort by length
             else:
                 raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
             for match in sorted_pairs:
                 start, end = match[:2]
                 assert 0 <= start < end  # Defend against segfaults
-                span_len = end-start
+                span_len = end - start
                 # If no tokens in the span have matched
                 if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0:
                     final_matches.append((key, *match))
@@ -301,9 +302,9 @@ cdef class Matcher:
             final_results = []
             for key, start, end, alignments in final_matches:
                 sorted_alignments = sorted(alignments, key=lambda x: (x['length'], x['token_idx']), reverse=False)
-                alignments = [0] * (end-start)
+                alignments = [0] * (end - start)
                 for align in sorted_alignments:
-                    if align['length'] >= end-start:
+                    if align['length'] >= end - start:
                         continue
                     # Since alignments are sorted in order of (length, token_idx)
                     # this overwrites smaller token_idx when they have same length.
@@ -325,7 +326,6 @@ cdef class Matcher:
         else:
             return key
 
-
 def unpickle_matcher(vocab, patterns, callbacks):
     matcher = Matcher(vocab)
     for key, pattern in patterns.items():
@@ -333,8 +333,8 @@ def unpickle_matcher(vocab, patterns, callbacks):
         matcher.add(key, pattern, on_match=callback)
     return matcher
 
-
-cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(),
+                  bint with_alignments=0):
     """Find matches in a doc, with a compiled array of patterns.
 
     Matches are returned as a list of (id, start, end) tuples or
     (id, start, end, alignments) tuples (if with_alignments != 0)
@@ -358,13 +358,13 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         # avoid any processing or mem alloc if the document is empty
         return output
     if len(predicates) > 0:
-        predicate_cache = <int8_t*>mem.alloc(length * len(predicates), sizeof(int8_t))
+        predicate_cache = <int8_t *> mem.alloc(length * len(predicates), sizeof(int8_t))
     if extensions is not None and len(extensions) >= 1:
         nr_extra_attr = max(extensions.values()) + 1
-        extra_attr_values = <attr_t*>mem.alloc(length * nr_extra_attr, sizeof(attr_t))
+        extra_attr_values = <attr_t *> mem.alloc(length * nr_extra_attr, sizeof(attr_t))
     else:
         nr_extra_attr = 0
-        extra_attr_values = <attr_t*>mem.alloc(length, sizeof(attr_t))
+        extra_attr_values = <attr_t *> mem.alloc(length, sizeof(attr_t))
     for i, token in enumerate(doclike):
         for name, index in extensions.items():
             value = token._.get(name)
@@ -378,8 +378,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
             states.push_back(PatternStateC(patterns[j], i, 0))
         if with_alignments != 0:
             align_states.resize(states.size())
-        transition_states(states, matches, align_states, align_matches, predicate_cache,
-                          doclike[i], extra_attr_values, predicates, with_alignments)
+        transition_states(states, matches, align_states, align_matches, predicate_cache, doclike[i], extra_attr_values,
+                          predicates, with_alignments)
         extra_attr_values += nr_extra_attr
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
@@ -389,7 +389,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         match = (
             matches[i].pattern_id,
             matches[i].start,
-            matches[i].start+matches[i].length
+            matches[i].start + matches[i].length
         )
         # We need to deduplicate, because we could otherwise arrive at the same
        # match through two paths, e.g. .?.? matching 'a'. Are we matching the
@@ -404,21 +404,19 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         seen.add(match)
     return output
 
-
 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
-                            vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
-                            int8_t* cached_py_predicates,
-                            Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
+                            vector[vector[MatchAlignmentC]]& align_states,
+                            vector[vector[MatchAlignmentC]]& align_matches,
+                            int8_t * cached_py_predicates, Token token,
+                            const attr_t * extra_attrs, py_predicates, bint with_alignments) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
     cdef vector[vector[MatchAlignmentC]] align_new_states
     cdef int nr_predicate = len(py_predicates)
     for i in range(states.size()):
         if states[i].pattern.nr_py >= 1:
-            update_predicate_cache(cached_py_predicates,
-                                   states[i].pattern, token, py_predicates)
-        action = get_action(states[i], token.c, extra_attrs,
-                            cached_py_predicates)
+            update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates)
+        action = get_action(states[i], token.c, extra_attrs, cached_py_predicates)
         if action == REJECT:
             continue
         # Keep only a subset of states (the active ones). Index q is the
@@ -431,31 +429,32 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
         if with_alignments != 0:
             align_state = align_states[i]
             align_states[q] = align_state
-        while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND):
+        while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND, RETRY_OR_EXTEND):
             # Update alignment before the transition of current state
             # 'MatchAlignmentC' maps 'original token index of current pattern' to 'current matching length'
             if with_alignments != 0:
                 align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
-            if action == RETRY_EXTEND:
+            if action in [RETRY_EXTEND, RETRY_OR_EXTEND]:
                 # This handles the 'extend'
                 new_states.push_back(
-                    PatternStateC(pattern=states[q].pattern, start=state.start,
-                                  length=state.length+1))
+                    PatternStateC(pattern=states[q].pattern, start=state.start, length=state.length + 1))
                 if with_alignments != 0:
                     align_new_states.push_back(align_states[q])
             if action == RETRY_ADVANCE:
                 # This handles the 'advance'
                 new_states.push_back(
-                    PatternStateC(pattern=states[q].pattern+1, start=state.start,
-                                  length=state.length+1))
+                    PatternStateC(pattern=states[q].pattern + 1, start=state.start, length=state.length + 1))
                 if with_alignments != 0:
                     align_new_states.push_back(align_states[q])
             states[q].pattern += 1
             if states[q].pattern.nr_py != 0:
-                update_predicate_cache(cached_py_predicates,
-                                       states[q].pattern, token, py_predicates)
-            action = get_action(states[q], token.c, extra_attrs,
-                                cached_py_predicates)
+                update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates)
+            next_action = get_action(states[q], token.c, extra_attrs, cached_py_predicates)
+            # To account for *? and +?
+            if get_quantifier(state) == ZERO_MINUS:
+                next_action = cast_to_non_greedy_action(action, next_action, new_states, align_new_states,
+                                                        with_alignments)
+            action = next_action
         # Update alignment before the transition of current state
         if with_alignments != 0:
             align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
@@ -465,48 +464,49 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             states[q].pattern += 1
             states[q].length += 1
             q += 1
+        elif action == EXTEND:
+            states[q].length += 1
+            q += 1
         else:
             ent_id = get_ent_id(state.pattern)
             if action == MATCH:
-                matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                           length=state.length+1))
+                matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
             elif action == MATCH_DOUBLE:
                 # push match without last token if length > 0
                 if state.length > 0:
-                    matches.push_back(
-                        MatchC(pattern_id=ent_id, start=state.start,
-                               length=state.length))
+                    matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
                     # MATCH_DOUBLE emits matches twice,
                     # add one more to align_matches in order to keep 1:1 relationship
                     if with_alignments != 0:
                         align_matches.push_back(align_states[q])
                 # push match with last token
-                matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                           length=state.length+1))
+                matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
             elif action == MATCH_REJECT:
-                matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                           length=state.length))
+                matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
             elif action == MATCH_EXTEND:
-                matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                           length=state.length))
+                matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
                 states[q].length += 1
                 q += 1
+            elif action == MATCH_ADVANCE:
+                matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
+                states[q].pattern += 1
+                states[q].length += 1
+                q += 1
     states.resize(q)
     for i in range(new_states.size()):
         states.push_back(new_states[i])
@@ -516,9 +516,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
     for i in range(align_new_states.size()):
         align_states.push_back(align_new_states[i])
 
-
-cdef int update_predicate_cache(int8_t* cache,
-                                const TokenPatternC* pattern, Token token, predicates) except -1:
+cdef int update_predicate_cache(int8_t * cache,
+                                const TokenPatternC * pattern, Token token, predicates) except -1:
     # If the state references any extra predicates, check whether they match.
     # These are cached, so that we don't call these potentially expensive
     # Python functions more than we need to.
@@ -536,7 +535,6 @@ cdef int update_predicate_cache(int8_t* cache,
     else:
         raise ValueError(Errors.E125.format(value=result))
 
-
 cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
                         vector[vector[MatchAlignmentC]]& align_matches,
                         vector[vector[MatchAlignmentC]]& align_states,
@@ -546,9 +544,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
     cdef vector[MatchAlignmentC] align_state
     for i in range(states.size()):
         state = states[i]
+        if is_non_greedy_star(state):
+            # If the final pattern token is a *?, remove the match by skipping it.
+            continue
         if with_alignments != 0:
             align_state = align_states[i]
-        while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
+        while get_quantifier(state) in (ZERO_PLUS, ZERO_MINUS, ZERO_ONE):
             # Update alignment before the transition of current state
             if with_alignments != 0:
                 align_state.push_back(MatchAlignmentC(state.pattern.token_idx, state.length))
@@ -564,13 +565,11 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
         else:
             state.pattern += 1
 
-
-cdef action_t get_action(PatternStateC state,
-                         const TokenC* token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches) nogil:
+cdef action_t get_action(PatternStateC state, const TokenC * token, const attr_t * extra_attrs,
+                         const int8_t * predicate_matches) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
-    b) What's the quantifier? [1, 0+, ?]
+    b) What's the quantifier? [1, 0+, 0-, ?]
     c) Is this the last specification? [final, non-final]
 
    We can transition in the following ways:
@@ -580,7 +579,8 @@ cdef action_t get_action(PatternStateC state,
     d) Do we add a state with (same state, next token)?
 
     We'll code the actions as boolean strings, so 0000 means no to all 4,
-    1000 means match but no states added, etc.
+    1000 means match but no states added,
+    and digits other than 0 and 1 encode special actions (see below).
 
     1:
       Yes, final:
@@ -600,13 +600,22 @@ cdef action_t get_action(PatternStateC state,
         1000 (note: Don't include last token!)
       No, non-final:
         0010
+    0-:
+      Yes, final:
+        2000 (note: Don't include last token!)
+      Yes, non-final:
+        0022 (note: Retry or Extend)
+      No, final:
+        2000 (note: Don't include last token!)
+      No, non-final:
+        0010
 
     ?:
       Yes, final:
-        1000
+        3000
       Yes, non-final:
         0100
       No, final:
-        1000 (note: Don't include last token!)
+        2000 (note: Don't include last token!)
       No, non-final:
         0010
 
@@ -617,9 +626,12 @@ cdef action_t get_action(PatternStateC state,
     MATCH = 1000
     ADVANCE = 0100
     RETRY = 0010
+    EXTEND = 0001
     MATCH_EXTEND = 1001
+    MATCH_ADVANCE = 1100
     RETRY_ADVANCE = 0110
     RETRY_EXTEND = 0011
+    RETRY_OR_EXTEND = 0022  # If there is a match after the retry, do not extend
     MATCH_REJECT = 2000  # Match, but don't include last token
     MATCH_DOUBLE = 3000  # Match both with and without last token
 
@@ -633,53 +645,75 @@ cdef action_t get_action(PatternStateC state,
         is_match = not is_match
         quantifier = ONE
     if quantifier == ONE:
-      if is_match and is_final:
-        # Yes, final: 1000
-        return MATCH
-      elif is_match and not is_final:
-        # Yes, non-final: 0100
-        return ADVANCE
-      elif not is_match and is_final:
-        # No, final: 0000
-        return REJECT
-      else:
-        return REJECT
+        if is_match and is_final:
+            # Yes, final: 1000
+            return MATCH
+        elif is_non_greedy_plus(state) and has_star_tail(state) and is_match and not is_final:
+            # Yes, non-final: 1100
+            # Modification for +?:
+            # MATCH_ADVANCE emits the match at the 'ONE' part of the token,
+            # instead of relying on MATCH_REJECT and actions from other
+            # tokens to produce a match.
+            # is_non_greedy_plus() verifies that the current state's pattern is +?
+            # has_star_tail() verifies that the remaining pattern tokens are either * or *?,
+            # so that it is valid for the current match to exist.
+            return MATCH_ADVANCE
+        elif is_match and not is_final:
+            # Yes, non-final: 0100
+            return ADVANCE
+        elif not is_match and is_final:
+            # No, final: 0000
+            return REJECT
+        else:
+            return REJECT
     elif quantifier == ZERO_PLUS:
-      if is_match and is_final:
-        # Yes, final: 1001
-        return MATCH_EXTEND
-      elif is_match and not is_final:
-        # Yes, non-final: 0011
-        return RETRY_EXTEND
-      elif not is_match and is_final:
-        # No, final 2000 (note: Don't include last token!)
-        return MATCH_REJECT
-      else:
-        # No, non-final 0010
-        return RETRY
+        if is_match and is_final:
+            # Yes, final: 1001
+            return MATCH_EXTEND
+        elif is_match and not is_final:
+            # Yes, non-final: 0011
+            return RETRY_EXTEND
+        elif not is_match and is_final:
+            # No, final 2000 (note: Don't include last token!)
+            return MATCH_REJECT
+        else:
+            # No, non-final 0010
+            return RETRY
+    elif quantifier == ZERO_MINUS:
+        if is_final or has_non_greedy_tail(state):
+            # Yes/No, final: 2000 (note: Don't include last token!)
+            return MATCH_REJECT
+        elif is_match:
+            # Yes, non-final: 0022
+            # If there is a match, further extensions are skipped so that
+            # the behaviour is non-greedy.
+            #     pattern: b*?b    string: b b
+            # We do not extend on the first b to exhibit non-greedy behaviour,
+            # such that "b" is matched but "b b" is not matched.
+            return RETRY_OR_EXTEND
+        else:
+            # No, non-final 0010
+            return RETRY
     elif quantifier == ZERO_ONE:
-      if is_match and is_final:
-        # Yes, final: 3000
-        # To cater for a pattern ending in "?", we need to add
-        # a match both with and without the last token
-        return MATCH_DOUBLE
-      elif is_match and not is_final:
-        # Yes, non-final: 0110
-        # We need both branches here, consider a pair like:
-        # pattern: .?b string: b
-        # If we 'ADVANCE' on the .?, we miss the match.
-        return RETRY_ADVANCE
-      elif not is_match and is_final:
-        # No, final 2000 (note: Don't include last token!)
-        return MATCH_REJECT
-      else:
-        # No, non-final 0010
-        return RETRY
-
+        if is_match and is_final:
+            # Yes, final: 3000
+            # To cater for a pattern ending in "?", we need to add
+            # a match both with and without the last token
+            return MATCH_DOUBLE
+        elif is_match and not is_final:
+            # Yes, non-final: 0110
+            # We need both branches here, consider a pair like:
+            # pattern: .?b string: b
+            # If we 'ADVANCE' on the .?, we miss the match.
+            return RETRY_ADVANCE
+        elif not is_match and is_final:
+            # No, final 2000 (note: Don't include last token!)
+            return MATCH_REJECT
+        else:
+            # No, non-final 0010
+            return RETRY
 
 cdef int8_t get_is_match(PatternStateC state,
-                         const TokenC* token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches) nogil:
+                         const TokenC * token, const attr_t * extra_attrs,
+                         const int8_t * predicate_matches) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0
@@ -693,6 +727,61 @@ cdef int8_t get_is_match(PatternStateC state,
         return 0
     return True
 
+cdef action_t cast_to_non_greedy_action(action_t action, action_t next_action, vector[PatternStateC]& new_states,
+                                        vector[vector[MatchAlignmentC]]& align_new_states, bint with_alignments) nogil:
+    """Cast "next_action" to another "action" that demonstrates non-greedy behaviour.
+
+    The "next_action" values that have to be modified to obtain non-greedy
+    behaviour are MATCH, MATCH_REJECT, MATCH_EXTEND and MATCH_DOUBLE.
+
+    cast_to_non_greedy_action() is required and cannot be merged into get_action(),
+    because it needs to compare two actions produced by two different pattern tokens.
+
+    next_action = MATCH, action = RETRY_OR_EXTEND
+    - Remove the extension when there is a MATCH.
+
+    next_action = MATCH_REJECT
+    - Cast MATCH_REJECT to REJECT.
+    - Remove the match, since it ends with the '*?' pattern token, and remove the current state.
+    - 'state' is ZERO_MINUS, so the previous doc token matched the ZERO_MINUS pattern token.
+    - E.g. pattern = "a*? b*", doc = "a a"
+      - MATCH_REJECT would add 'a' to the matches in transition_states(),
+        and casting MATCH_REJECT to REJECT removes such results.
+
+    next_action = MATCH_EXTEND, action = RETRY (where the RETRY came from the ZERO_MINUS quantifier)
+    - Cast MATCH_EXTEND to EXTEND.
+    - Remove the match, since it ends with the '*?' pattern token.
+    - E.g. pattern = "a*? b*", doc = "a b"
+      - MATCH_EXTEND would add 'a' to the matches in transition_states(),
+        and casting MATCH_EXTEND to EXTEND removes such results.
+
+    next_action = MATCH_DOUBLE, action = RETRY (where the RETRY came from the ZERO_MINUS quantifier)
+    - Cast MATCH_DOUBLE to MATCH.
+    - MATCH_DOUBLE adds 2 matches, one with and one without the last token; casting the action
+      to MATCH removes the match without the last token, which is the match that ends with a
+      '*?' pattern token.
+    - E.g. pattern = "a*? b?", doc = "a b"
+      - MATCH_DOUBLE would add the following 2 matches: 'a' and 'a b',
+        and casting MATCH_DOUBLE to MATCH removes 'a'.
+    """
+    if action == RETRY_OR_EXTEND and next_action == MATCH:
+        # Stop the extension once there is a match
+        new_states.pop_back()
+        if with_alignments != 0:
+            align_new_states.pop_back()
+        return MATCH
+    elif next_action == MATCH_REJECT:
+        # Remove matches that end with a *? token.
+        # MATCH_REJECT would produce matches that end with the *? token, since the
+        # previous doc token matched the *? pattern token.
+        return REJECT
+    elif action == RETRY and next_action == MATCH_EXTEND:
+        # This handles the 'extend' without matching.
+        # Remove matches that end with a *? token.
+        return EXTEND
+    elif action == RETRY and next_action == MATCH_DOUBLE:
+        # Remove matches that end with a *? token for operator '?'
+        return MATCH
+    else:
+        return next_action
 
 cdef inline int8_t get_is_final(PatternStateC state) nogil:
     if state.pattern[1].quantifier == FINAL_ID:
@@ -700,31 +789,60 @@ cdef inline int8_t get_is_final(PatternStateC state) nogil:
     else:
         return 0
 
-
 cdef inline int8_t get_quantifier(PatternStateC state) nogil:
     return state.pattern.quantifier
 
+cdef inline int8_t is_non_greedy_plus(PatternStateC state) nogil:
+    """Verify whether the current state pattern is '+?'."""
+    if (state.pattern + 1).quantifier == ZERO_MINUS and get_quantifier(state) == ONE \
+            and (state.pattern + 1).token_idx == state.pattern.token_idx:
+        return 1
+    else:
+        return 0
 
-cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
-    pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
+cdef inline int8_t is_non_greedy_star(PatternStateC state) nogil:
+    """Verify whether the current state pattern is '*?'."""
+    if (state.pattern - 1).quantifier != ONE and get_quantifier(state) == ZERO_MINUS:
+        return 1
+    else:
+        return 0
+
+cdef inline int8_t has_star_tail(PatternStateC state) nogil:
+    """Verify whether all remaining pattern tokens are either '*' or '*?'."""
+    while not get_is_final(state):
+        state.pattern += 1
+        if get_quantifier(state) not in [ZERO_PLUS, ZERO_MINUS]:
+            return 0
+    return 1
+
+cdef inline int8_t has_non_greedy_tail(PatternStateC state) nogil:
+    """Verify whether all remaining pattern tokens are '*?'."""
+    while not get_is_final(state):
+        state.pattern += 1
+        if state.pattern.quantifier != ZERO_MINUS:
+            return 0
+    return 1
+
+cdef TokenPatternC * init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
+    pattern = <TokenPatternC *> mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
     cdef int i, index
     for i, (quantifier, spec, extensions, predicates, token_idx) in enumerate(token_specs):
         pattern[i].quantifier = quantifier
         # Ensure attrs refers to a null pointer if nr_attr == 0
         if len(spec) > 0:
-            pattern[i].attrs = <AttrValueC*>mem.alloc(len(spec), sizeof(AttrValueC))
+            pattern[i].attrs = <AttrValueC *> mem.alloc(len(spec), sizeof(AttrValueC))
             pattern[i].nr_attr = len(spec)
         for j, (attr, value) in enumerate(spec):
             pattern[i].attrs[j].attr = attr
             pattern[i].attrs[j].value = value
         if len(extensions) > 0:
-            pattern[i].extra_attrs = <IndexValueC*>mem.alloc(len(extensions), sizeof(IndexValueC))
+            pattern[i].extra_attrs = <IndexValueC *> mem.alloc(len(extensions), sizeof(IndexValueC))
            for j, (index, value) in enumerate(extensions):
                pattern[i].extra_attrs[j].index = index
                pattern[i].extra_attrs[j].value = value
            pattern[i].nr_extra_attr = len(extensions)
         if len(predicates) > 0:
-            pattern[i].py_predicates = <int32_t*>mem.alloc(len(predicates), sizeof(int32_t))
+            pattern[i].py_predicates = <int32_t *> mem.alloc(len(predicates), sizeof(int32_t))
             for j, index in enumerate(predicates):
                 pattern[i].py_predicates[j] = index
             pattern[i].nr_py = len(predicates)
@@ -734,7 +852,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
     # Use quantifier to identify final ID pattern node (rather than previous
     # uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
     pattern[i].quantifier = FINAL_ID
-    pattern[i].attrs = <AttrValueC*>mem.alloc(1, sizeof(AttrValueC))
+    pattern[i].attrs = <AttrValueC *> mem.alloc(1, sizeof(AttrValueC))
     pattern[i].attrs[0].attr = ID
     pattern[i].attrs[0].value = entity_id
     pattern[i].nr_attr = 1
@@ -744,7 +862,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
     return pattern
 
 
-cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
+cdef attr_t get_ent_id(const TokenPatternC * pattern) nogil:
     while pattern.quantifier != FINAL_ID:
         pattern += 1
     id_attr = pattern[0].attrs[0]
@@ -979,7 +1097,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
 
 
 def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
-        seen_predicates):
+                                    seen_predicates):
     output = []
     for attr, value in spec.items():
         if isinstance(value, dict):
@@ -1000,7 +1118,8 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
 def _get_operators(spec):
     # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
     lookup = {"*": (ZERO_PLUS,), "+": (ONE, ZERO_PLUS),
-              "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
+              "?": (ZERO_ONE,), "*?": (ZERO_MINUS,),
+              "+?": (ONE, ZERO_MINUS), "1": (ONE,), "!": (ZERO,)}
     # Fix casing
     spec = {key.upper(): values for key, values in spec.items()
             if isinstance(key, str)}
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 658e45268..f27dc51a2 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -202,6 +202,8 @@ class TokenPatternNumber(BaseModel):
 class TokenPatternOperatorSimple(str, Enum):
     plus: StrictStr = StrictStr("+")
     star: StrictStr = StrictStr("*")
+    plus_question: StrictStr = StrictStr("+?")
+    star_question: StrictStr = StrictStr("*?")
     question: StrictStr = StrictStr("?")
     exclamation: StrictStr = StrictStr("!")
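
Illustrative usage (not part of the patch). A minimal sketch of how the new
operators are intended to behave once the patch is applied; the token texts
are arbitrary and the expected output is inferred from the commit
description, not from a test run:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab, validate=True)

    # Greedy '+' explores every extension, so it yields all contiguous
    # spans of 'a' tokens; non-greedy '+?' should stop at the shortest
    # span that satisfies the pattern.
    matcher.add("GREEDY", [[{"LOWER": "a", "OP": "+"}]])
    matcher.add("LAZY", [[{"LOWER": "a", "OP": "+?"}]])

    doc = nlp("a a a")
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)
    # Expected, per the commit message: GREEDY emits "a", "a a", "a a a", ...
    # while LAZY emits only the minimal "a" matches.

Passing validate=True also exercises the schemas.py change: without the new
TokenPatternOperatorSimple members, "+?" and "*?" would be rejected during
pattern validation.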
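
To compare the greedy and non-greedy branches side by side, here is a
hypothetical pure-Python rendering of the ZERO_PLUS and ZERO_MINUS tables
from the get_action() docstring (a simplified model with string names, not
the actual Cython implementation; the function name action_for is invented):

    # Digits read match/advance/retry/extend; 2 marks the special cases.
    def action_for(quantifier, is_match, is_final, non_greedy_tail=False):
        if quantifier == "ZERO_PLUS":        # greedy '*'
            if is_match and is_final:
                return "MATCH_EXTEND"        # 1001
            if is_match:
                return "RETRY_EXTEND"        # 0011: always queue the extension
            if is_final:
                return "MATCH_REJECT"        # 2000: match, drop the last token
            return "RETRY"                   # 0010
        if quantifier == "ZERO_MINUS":       # non-greedy '*?'
            if is_final or non_greedy_tail:
                return "MATCH_REJECT"        # 2000: never include the last token
            if is_match:
                return "RETRY_OR_EXTEND"     # 0022: extend only if the retry fails
            return "RETRY"                   # 0010
        raise ValueError(quantifier)

The structural difference is confined to one case: a matching '*?' token
defers its extension (RETRY_OR_EXTEND) instead of always queueing it
(RETRY_EXTEND), and cast_to_non_greedy_action() then drops the deferred
extension whenever the retry already produced a match.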
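
Note on the sugar expansion: '+?' is not a quantifier of its own. As with
'+', _get_operators() compiles it into two pattern tokens, one required match
followed by a lazy star, which is why is_non_greedy_plus() recognises '+?' by
a ONE quantifier whose successor is ZERO_MINUS with the same token_idx.
Restated as plain data (quantifier names as in matcher.pxd):

    lookup = {
        "*":  ("ZERO_PLUS",),           # greedy star
        "*?": ("ZERO_MINUS",),          # lazy star
        "+":  ("ONE", "ZERO_PLUS"),     # one required match + greedy star
        "+?": ("ONE", "ZERO_MINUS"),    # one required match + lazy star
        "?":  ("ZERO_ONE",),
        "1":  ("ONE",),
        "!":  ("ZERO",),
    }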