mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Merge pull request #1402 from explosion/feature/fix-matcher-operators
💫 Fix Matcher variable-length operators
			
			
This commit is contained in:
		
						commit
						010a7309ff
					
				|  | @ -71,6 +71,11 @@ cdef enum action_t: | |||
|     ADVANCE_ZERO | ||||
|     PANIC | ||||
| 
 | ||||
| # A "match expression" consists of one or more token patterns | ||||
| # Each token pattern consists of a quantifier and 0+ (attr, value) pairs. | ||||
| # A state is an (int, pattern pointer) pair, where the int is the start | ||||
| # position, and the pattern pointer shows where we're up to | ||||
| # in the pattern. | ||||
| 
 | ||||
| cdef struct AttrValueC: | ||||
|     attr_id_t attr | ||||
|  | @ -130,7 +135,13 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: | |||
|     elif pattern.quantifier in (ONE, ZERO_ONE): | ||||
|         return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE | ||||
|     elif pattern.quantifier == ZERO_PLUS: | ||||
|         return REPEAT | ||||
|         # This is a bandaid over the 'shadowing' problem described here: | ||||
|         # https://github.com/explosion/spaCy/issues/864 | ||||
|         next_action = get_action(pattern+1, token) | ||||
|         if next_action is REJECT: | ||||
|             return REPEAT | ||||
|         else: | ||||
|             return ADVANCE_ZERO | ||||
|     else: | ||||
|         return PANIC | ||||
| 
 | ||||
|  | @ -220,16 +231,28 @@ cdef class Matcher: | |||
|         return len(self._patterns) | ||||
| 
 | ||||
|     def add(self, key, on_match, *patterns): | ||||
|         """Add a match-rule to the matcher. | ||||
|         A match-rule consists of: an ID key, an on_match callback, and one or | ||||
|         more patterns. If the key exists, the patterns are appended to the | ||||
|         previous ones, and the previous on_match callback is replaced. The | ||||
|         `on_match` callback will receive the arguments `(matcher, doc, i, | ||||
|         matches)`. You can also set `on_match` to `None` to not perform any | ||||
|         actions. A pattern consists of one or more `token_specs`, where a | ||||
|         `token_spec` is a dictionary mapping attribute IDs to values. Token | ||||
|         descriptors can also include quantifiers. There are currently important | ||||
|         known problems with the quantifiers – see the docs. | ||||
|         """Add a match-rule to the matcher. A match-rule consists of: an ID key, | ||||
|         an on_match callback, and one or more patterns. | ||||
| 
 | ||||
|         If the key exists, the patterns are appended to the previous ones, and | ||||
|         the previous on_match callback is replaced. The `on_match` callback will | ||||
|         receive the arguments `(matcher, doc, i, matches)`. You can also set | ||||
|         `on_match` to `None` to not perform any actions. | ||||
| 
 | ||||
|         A pattern consists of one or more `token_specs`, where a `token_spec` | ||||
|         is a dictionary mapping attribute IDs to values, and optionally a | ||||
|         quantifier operator under the key "op". The available quantifiers are: | ||||
| 
 | ||||
|         '!': Negate the pattern, by requiring it to match exactly 0 times. | ||||
|         '?': Make the pattern optional, by allowing it to match 0 or 1 times. | ||||
|         '+': Require the pattern to match 1 or more times. | ||||
|         '*': Allow the pattern to match zero or more times. | ||||
| 
 | ||||
|         The + and * operators are usually interpreted "greedily", i.e. longer | ||||
|         matches are returned where possible. However, if you specify two '+' | ||||
|         and '*' patterns in a row and their matches overlap, the first | ||||
|         operator will behave non-greedily. This quirk in the semantics | ||||
|         makes the matcher more efficient, by avoiding the need for back-tracking. | ||||
|         """ | ||||
|         for pattern in patterns: | ||||
|             if len(pattern) == 0: | ||||
|  |  | |||
|  | @ -107,11 +107,22 @@ def test_matcher_empty_dict(en_vocab): | |||
|     matches = matcher(doc) | ||||
|     assert len(matches) == 1 | ||||
|     assert matches[0][1:] == (0, 3) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add('A.', None, [{'ORTH': 'a'}, {}]) | ||||
|     matches = matcher(doc) | ||||
|     assert matches[0][1:] == (0, 2) | ||||
|   | ||||
| 
 | ||||
| def test_matcher_operator_shadow(en_vocab): | ||||
|     matcher = Matcher(en_vocab) | ||||
|     abc = ["a", "b", "c"] | ||||
|     doc = get_doc(matcher.vocab, abc) | ||||
|     matcher.add('A.C', None, [{'ORTH': 'a'}, | ||||
|                               {"IS_ALPHA": True, "OP": "+"}, | ||||
|                               {'ORTH': 'c'}]) | ||||
|     matches = matcher(doc) | ||||
|     assert len(matches) == 1 | ||||
|     assert matches[0][1:] == (0, 3) | ||||
|   | ||||
| def test_matcher_phrase_matcher(en_vocab): | ||||
|     words = ["Google", "Now"] | ||||
|     doc = get_doc(en_vocab, words) | ||||
|  | @ -165,3 +176,39 @@ def test_matcher_match_one_plus(matcher): | |||
|                                          {'ORTH': 'Philippe', 'OP': '+'}]) | ||||
|     m = matcher(doc) | ||||
|     assert len(m) == 1 | ||||
| 
 | ||||
| 
 | ||||
| def test_operator_combos(matcher): | ||||
|     cases = [ | ||||
|         ('aaab', 'a a a b', True), | ||||
|         ('aaab', 'a+ b', True), | ||||
|         ('aaab', 'a+ a+ b', True), | ||||
|         ('aaab', 'a+ a+ a b', True), | ||||
|         ('aaab', 'a+ a+ a+ b', True), | ||||
|         ('aaab', 'a+ a a b', True), | ||||
|         ('aaab', 'a+ a a', True), | ||||
|         ('aaab', 'a+', True), | ||||
|         ('aaa', 'a+ b', False), | ||||
|         ('aaa', 'a+ a+ b', False), | ||||
|         ('aaa', 'a+ a+ a+ b', False), | ||||
|         ('aaa', 'a+ a b', False), | ||||
|         ('aaa', 'a+ a a b', False), | ||||
|         ('aaab', 'a+ a a', True), | ||||
|         ('aaab', 'a+', True), | ||||
|         ('aaab', 'a+ a b', True), | ||||
|     ] | ||||
|     for string, pattern_str, result in cases: | ||||
|         matcher = Matcher(matcher.vocab) | ||||
|         doc = get_doc(matcher.vocab, words=list(string)) | ||||
|         pattern = [] | ||||
|         for part in pattern_str.split(): | ||||
|             if part.endswith('+'): | ||||
|                 pattern.append({'ORTH': part[0], 'op': '+'}) | ||||
|             else: | ||||
|                 pattern.append({'ORTH': part}) | ||||
|         matcher.add('PATTERN', None, pattern) | ||||
|         matches = matcher(doc) | ||||
|         if result: | ||||
|             assert matches, (string, pattern_str) | ||||
|         else: | ||||
|             assert not matches, (string, pattern_str) | ||||
|  |  | |||
|  | @ -142,33 +142,30 @@ p | |||
|     |  are no nested or scoped quantifiers – instead, you can build those | ||||
|     |  behaviours with #[code on_match] callbacks. | ||||
| 
 | ||||
| +aside("Problems with quantifiers") | ||||
|     |  Using quantifiers may lead to unexpected results when matching | ||||
|     |  variable-length patterns, for example if the next token would also be | ||||
|     |  matched by the previous token. This problem should be resolved in a future | ||||
|     |  release. For more information, see | ||||
|     |  #[+a(gh("spaCy") + "/issues/864") this issue]. | ||||
| 
 | ||||
| +table([ "OP", "Description", "Example"]) | ||||
| +table([ "OP", "Description"]) | ||||
|     +row | ||||
|         +cell #[code !] | ||||
|         +cell match exactly 0 times | ||||
|         +cell negation | ||||
|         +cell Negate the pattern, by requiring it to match exactly 0 times. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code *] | ||||
|         +cell match 0 or more times | ||||
|         +cell optional, variable number | ||||
|         +cell Allow the pattern to match 0 or more times. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code +] | ||||
|         +cell match 1 or more times | ||||
|         +cell mandatory, variable number | ||||
|         +cell Require the pattern to match 1 or more times. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code ?] | ||||
|         +cell match 0 or 1 times | ||||
|         +cell optional, max one | ||||
|         +cell Make the pattern optional, by allowing it to match 0 or 1 times. | ||||
| 
 | ||||
| p | ||||
|     |  The #[code +] and #[code *] operators are usually interpreted | ||||
|     |  "greedily", i.e. longer matches are returned where possible. However, if | ||||
|     |  you specify two #[code +] and #[code *] patterns in a row and their | ||||
|     |  matches overlap, the first operator will behave non-greedily. This quirk | ||||
|     |  in the semantics makes the matcher more efficient, by avoiding the need | ||||
|     |  for back-tracking. | ||||
| 
 | ||||
| +h(3, "adding-phrase-patterns") Adding phrase patterns | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user