Merge pull request #1402 from explosion/feature/fix-matcher-operators

💫 Fix Matcher variable-length operators
2025-08-24 05:54:55 +03:00 · 2017-10-16 17:53:19 +02:00 · 2017-10-16 17:53:19 +02:00 · 010a7309ff
commit 010a7309ff
parent 15514dc333 c29927d2e7
3 changed files with 95 additions and 28 deletions
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -71,6 +71,11 @@ cdef enum action_t:
    ADVANCE_ZERO
    PANIC

+# A "match expression" consists of one or more token patterns.
+# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
+# A state is an (int, pattern pointer) pair, where the int is the start
+# position, and the pattern pointer shows where we're up to
+# in the pattern.

 cdef struct AttrValueC:
    attr_id_t attr
@ -130,7 +135,13 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
    elif pattern.quantifier in (ONE, ZERO_ONE):
        return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
    elif pattern.quantifier == ZERO_PLUS:
-        return REPEAT
+        # This is a bandaid over the 'shadowing' problem described here:
+        # https://github.com/explosion/spaCy/issues/864
+        next_action = get_action(pattern+1, token)
+        if next_action is REJECT:
+            return REPEAT
+        else:
+            return ADVANCE_ZERO
    else:
        return PANIC

@ -220,16 +231,28 @@ cdef class Matcher:
        return len(self._patterns)

    def add(self, key, on_match, *patterns):
-        """Add a match-rule to the matcher.
-        A match-rule consists of: an ID key, an on_match callback, and one or
-        more patterns. If the key exists, the patterns are appended to the
-        previous ones, and the previous on_match callback is replaced. The
-        `on_match` callback will receive the arguments `(matcher, doc, i,
-        matches)`. You can also set `on_match` to `None` to not perform any
-        actions. A pattern consists of one or more `token_specs`, where a
-        `token_spec` is a dictionary mapping attribute IDs to values. Token
-        descriptors can also include quantifiers. There are currently important
-        known problems with the quantifiers – see the docs.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
+        an on_match callback, and one or more patterns.
+
+        If the key exists, the patterns are appended to the previous ones, and
+        the previous on_match callback is replaced. The `on_match` callback will
+        receive the arguments `(matcher, doc, i, matches)`. You can also set
+        `on_match` to `None` to not perform any actions.
+
+        A pattern consists of one or more `token_specs`, where a `token_spec`
+        is a dictionary mapping attribute IDs to values, and optionally a
+        quantifier operator under the key "op". The available quantifiers are:
+
+        '!': Negate the pattern, by requiring it to match exactly 0 times.
+        '?': Make the pattern optional, by allowing it to match 0 or 1 times.
+        '+': Require the pattern to match 1 or more times.
+        '*': Allow the pattern to match 0 or more times.
+
+        The + and * operators are usually interpreted "greedily", i.e. longer
+        matches are returned where possible. However, if you specify two '+'
+        or '*' patterns in a row and their matches overlap, the first
+        operator will behave non-greedily. This quirk in the semantics
+        makes the matcher more efficient, by avoiding the need for back-tracking.
        """
        for pattern in patterns:
            if len(pattern) == 0:
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@ -107,11 +107,22 @@ def test_matcher_empty_dict(en_vocab):
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
+    matcher = Matcher(en_vocab)
    matcher.add('A.', None, [{'ORTH': 'a'}, {}])
    matches = matcher(doc)
    assert matches[0][1:] == (0, 2)
 
-
+def test_matcher_operator_shadow(en_vocab):
+    matcher = Matcher(en_vocab)
+    abc = ["a", "b", "c"]
+    doc = get_doc(matcher.vocab, abc)
+    matcher.add('A.C', None, [{'ORTH': 'a'},
+                              {"IS_ALPHA": True, "OP": "+"},
+                              {'ORTH': 'c'}])
+    matches = matcher(doc)
+    assert len(matches) == 1
+    assert matches[0][1:] == (0, 3)
+ 
 def test_matcher_phrase_matcher(en_vocab):
    words = ["Google", "Now"]
    doc = get_doc(en_vocab, words)
@ -165,3 +176,39 @@ def test_matcher_match_one_plus(matcher):
                                         {'ORTH': 'Philippe', 'OP': '+'}])
    m = matcher(doc)
    assert len(m) == 1
+
+
+def test_operator_combos(matcher):
+    cases = [
+        ('aaab', 'a a a b', True),
+        ('aaab', 'a+ b', True),
+        ('aaab', 'a+ a+ b', True),
+        ('aaab', 'a+ a+ a b', True),
+        ('aaab', 'a+ a+ a+ b', True),
+        ('aaab', 'a+ a a b', True),
+        ('aaab', 'a+ a a', True),
+        ('aaab', 'a+', True),
+        ('aaa', 'a+ b', False),
+        ('aaa', 'a+ a+ b', False),
+        ('aaa', 'a+ a+ a+ b', False),
+        ('aaa', 'a+ a b', False),
+        ('aaa', 'a+ a a b', False),
+        ('aaab', 'a+ a a', True),
+        ('aaab', 'a+', True),
+        ('aaab', 'a+ a b', True),
+    ]
+    for string, pattern_str, result in cases:
+        matcher = Matcher(matcher.vocab)
+        doc = get_doc(matcher.vocab, words=list(string))
+        pattern = []
+        for part in pattern_str.split():
+            if part.endswith('+'):
+                pattern.append({'ORTH': part[0], 'op': '+'})
+            else:
+                pattern.append({'ORTH': part})
+        matcher.add('PATTERN', None, pattern)
+        matches = matcher(doc)
+        if result:
+            assert matches, (string, pattern_str)
+        else:
+            assert not matches, (string, pattern_str)
--- a/website/usage/_linguistic-features/_rule-based-matching.jade
+++ b/website/usage/_linguistic-features/_rule-based-matching.jade
@ -142,33 +142,30 @@ p
    |  are no nested or scoped quantifiers – instead, you can build those
    |  behaviours with #[code on_match] callbacks.

-+aside("Problems with quantifiers")
-    |  Using quantifiers may lead to unexpected results when matching
-    |  variable-length patterns, for example if the next token would also be
-    |  matched by the previous token. This problem should be resolved in a future
-    |  release. For more information, see
-    |  #[+a(gh("spaCy") + "/issues/864") this issue].
-
-+table([ "OP", "Description", "Example"])
++table([ "OP", "Description"])
    +row
        +cell #[code !]
-        +cell match exactly 0 times
-        +cell negation
+        +cell Negate the pattern, by requiring it to match exactly 0 times.

    +row
        +cell #[code *]
-        +cell match 0 or more times
-        +cell optional, variable number
+        +cell Allow the pattern to match 0 or more times.

    +row
        +cell #[code +]
-        +cell match 1 or more times
-        +cell mandatory, variable number
+        +cell Require the pattern to match 1 or more times.

    +row
        +cell #[code ?]
-        +cell match 0 or 1 times
-        +cell optional, max one
+        +cell Make the pattern optional, by allowing it to match 0 or 1 times.
+
+p
+    |  The #[code +] and #[code *] operators are usually interpreted
+    |  "greedily", i.e. longer matches are returned where possible. However, if
+    |  you specify two #[code +] or #[code *] patterns in a row and their
+    |  matches overlap, the first operator will behave non-greedily. This quirk
+    |  in the semantics makes the matcher more efficient, by avoiding the need
+    |  for back-tracking.

 +h(3, "adding-phrase-patterns") Adding phrase patterns