Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 09:26:27 +03:00)
Commit 6a8cb905aa
spacy/matcher.pyx
@@ -8,9 +8,15 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref
 from murmurhash.mrmr cimport hash64
+from libc.stdint cimport int32_t
+
+# try:
+#     from libcpp.unordered_map cimport unordered_map as umap
+# except:
+#     from libcpp.map cimport map as umap
 
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .structs cimport TokenC
@@ -62,10 +68,11 @@ cdef enum action_t:
     REPEAT
     ACCEPT
     ADVANCE_ZERO
+    ADVANCE_PLUS
     ACCEPT_PREV
     PANIC
 
 # A "match expression" consists of one or more token patterns.
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
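To make that terminology concrete: a "match expression" is what gets passed to Matcher.add, and each token pattern is a dict of (attr, value) pairs plus an optional 'OP' quantifier. A minimal sketch, using the spaCy 2.0-era add() signature that the new tests below rely on:

    from spacy.matcher import Matcher
    from spacy.vocab import Vocab

    matcher = Matcher(Vocab())
    # Two token patterns: exactly one 'A', then zero or more 'A's --
    # the equivalent of the regex 'AA*'.
    matcher.add('AA*', None,
                [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}])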
@@ -83,7 +90,25 @@ cdef struct TokenPatternC:
 
 
 ctypedef TokenPatternC* TokenPatternC_ptr
-ctypedef pair[int, TokenPatternC_ptr] StateC
+# ctypedef pair[int, TokenPatternC_ptr] StateC
+
+# Match Dictionary entry type
+cdef struct MatchEntryC:
+    int32_t start
+    int32_t end
+    int32_t offset
+
+# A state instance represents the information that defines a
+# partial match
+# start: the index of the first token in the partial match
+# pattern: a pointer to the current token pattern in the full
+#          pattern
+# last_match: the entry of the last span matched by the
+#             same pattern
+cdef struct StateC:
+    int32_t start
+    TokenPatternC_ptr pattern
+    MatchEntryC* last_match
 
 
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
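For readers who don't speak Cython, a rough Python analogue of the two structs (illustrative only; an integer index stands in for the C pattern pointer):

    from dataclasses import dataclass

    @dataclass
    class MatchEntry:            # mirrors MatchEntryC
        start: int               # token index where the last match began
        end: int                 # token index just past its last token
        offset: int              # position of that match in the output list

    @dataclass
    class State:                 # mirrors StateC
        start: int               # first token of the partial match
        pattern_pos: int         # how far through the pattern we are
        last_match: MatchEntry   # last completed span for the same pattern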
@@ -128,7 +153,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
     if pattern.quantifier == ZERO:
         return REJECT
     elif lookahead.nr_attr == 0:
-        return ACCEPT
+        if pattern.quantifier == ZERO_PLUS:
+            return REPEAT
+        else:
+            return ACCEPT
     elif pattern.quantifier in (ONE, ZERO_ONE):
         return ADVANCE
     elif pattern.quantifier == ZERO_PLUS:
@@ -138,7 +166,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if next_action is REJECT:
             return REPEAT
         else:
-            return ADVANCE_ZERO
+            return ADVANCE_PLUS
     else:
         return PANIC
 
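The behavioral core of the patch is the new ADVANCE_PLUS action: when a token satisfies a ZERO_PLUS ('*') slot, the matcher no longer commits to a single reading but keeps both the "stay on the starred slot" and "advance past it" branches alive. A toy Python sketch of that forking idea (hypothetical helper names, not the real Cython internals):

    def step(open_states, fits_slot):
        # Each open state is the pattern-slot index it is sitting on;
        # fits_slot(pos) says whether the current token satisfies slot pos.
        next_states = []
        for pos in open_states:
            if fits_slot(pos):               # token fits the '*' slot
                next_states.append(pos)      # branch 1: repeat the slot
                next_states.append(pos + 1)  # branch 2: advance past it
        return next_states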
@@ -339,77 +367,223 @@ cdef class Matcher:
         cdef int i, token_i
         cdef const TokenC* token
         cdef StateC state
+        cdef int j = 0
+        cdef int k
+        cdef bint overlap = False
+        cdef MatchEntryC* state_match
+        cdef MatchEntryC* last_matches = <MatchEntryC*>self.mem.alloc(self.patterns.size(), sizeof(MatchEntryC))
+
+        for i in range(self.patterns.size()):
+            last_matches[i].start = 0
+            last_matches[i].end = 0
+            last_matches[i].offset = 0
+
         matches = []
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
             # Go over the open matches, extending or finalizing if able.
             # Otherwise, we over-write them (q doesn't advance)
-            for state in partials:
-                action = get_action(state.second, token)
+            #for state in partials:
+            j = 0
+            while j < n_partials:
+                state = partials[j]
+                action = get_action(state.pattern, token)
+                j += 1
+                # Skip patterns that would overlap with an existing match.
+                # Patterns overlap an existing match if they point to the
+                # same final state and start between the start and end
+                # of said match.
+                # Different patterns with the same label are allowed to
+                # overlap.
+                state_match = state.last_match
+                if (state.start > state_match.start
+                        and state.start < state_match.end):
+                    continue
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 while action == ADVANCE_ZERO:
-                    state.second += 1
-                    action = get_action(state.second, token)
+                    state.pattern += 1
+                    action = get_action(state.pattern, token)
+                if action == PANIC:
+                    raise Exception("Error selecting action in matcher")
+
+                # ADVANCE_PLUS acts like REPEAT, but also pushes a partial that
+                # acts like an ADVANCE_ZERO
+                if action == ADVANCE_PLUS:
+                    state.pattern += 1
+                    partials.push_back(state)
+                    n_partials += 1
+                    state.pattern -= 1
+                    action = REPEAT
+
+                if action == ADVANCE:
+                    state.pattern += 1
+
+                # Check for partial matches that are at the same spec in the
+                # same pattern. Keep the longer of the matches. This ensures
+                # that there are never more than 2 partials for every spec
+                # in a pattern (one of which gets pruned in this step).
+
+                overlap = False
+                for i in range(q):
+                    if state.pattern == partials[i].pattern and state.start < partials[i].start:
+                        partials[i] = state
+                        j = i
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+                overlap = False
+                for i in range(q):
+                    if state.pattern == partials[i].pattern:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+
+
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
                     # (i.e. we don't overwrite -- we want to greedily match
                     # more of the pattern.)
                     partials[q] = state
                     q += 1
                 elif action == REJECT:
                     pass
                 elif action == ADVANCE:
                     partials[q] = state
-                    partials[q].second += 1
                     q += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    start = state.first
+                    start = state.start
                     end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.second[1].attrs[0].value
-                    label = state.second[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    ent_id = state.pattern[1].attrs[0].value
+                    label = state.pattern[1].attrs[1].value
+                    # Check that this match doesn't overlap with an earlier match.
+                    # Only overwrite an earlier match if it is a substring of this
+                    # match (i.e. it starts after this match starts).
+                    state_match = state.last_match
+
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
+                        matches.append((ent_id, start, end))
+                    elif start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            assert state_match.offset == 0
+                            state_match.offset = 0
+                            matches.append((ent_id, start, end))
+                        else:
+                            i = state_match.offset
+                            matches[i] = (ent_id, start, end)
+                        state_match.start = start
+                        state_match.end = end
+                    else:
+                        pass
 
             partials.resize(q)
+            n_partials = q
             # Check whether we open any new patterns on this token
+            i = 0
             for pattern in self.patterns:
+                # Skip patterns that would overlap with an existing match
+                # state_match = pattern.last_match
+                state_match = &last_matches[i]
+                i += 1
+                if (token_i > state_match.start
+                        and token_i < state_match.end):
+                    continue
                 action = get_action(pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
-                while action == ADVANCE_ZERO:
+                while action in (ADVANCE_PLUS, ADVANCE_ZERO):
+                    if action == ADVANCE_PLUS:
+                        state.start = token_i
+                        state.pattern = pattern
+                        state.last_match = state_match
+                        partials.push_back(state)
+                        n_partials += 1
                     pattern += 1
                     action = get_action(pattern, token)
+
+                if action == ADVANCE:
+                    pattern += 1
+                j = 0
+                overlap = False
+                for j in range(q):
+                    if pattern == partials[j].pattern:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+
+
                 if action == REPEAT:
-                    state.first = token_i
-                    state.second = pattern
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
+                    n_partials += 1
                 elif action == ADVANCE:
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    state.first = token_i
-                    state.second = pattern + 1
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
+                    n_partials += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     start = token_i
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
 
                     label = pattern[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
+                        matches.append((ent_id, start, end))
+                    if start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            state_match.offset = 0
+                            matches.append((ent_id, start, end))
+                        else:
+                            j = state_match.offset
+                            matches[j] = (ent_id, start, end)
+                        state_match.start = start
+                        state_match.end = end
+                    else:
+                        pass
 
         # Look for open patterns that are actually satisfied
         for state in partials:
-            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
-                state.second += 1
-            if state.second.nr_attr == 0:
-                start = state.first
+            while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
+                state.pattern += 1
+            if state.pattern.nr_attr == 0:
+                start = state.start
                 end = len(doc)
-                ent_id = state.second.attrs[0].value
-                label = state.second.attrs[0].value
-                matches.append((ent_id, start, end))
+                ent_id = state.pattern.attrs[0].value
+                label = state.pattern.attrs[1].value
+                state_match = state.last_match
+                if start >= state_match.end:
+                    state_match.start = start
+                    state_match.end = end
+                    state_match.offset = len(matches)
+                    matches.append((ent_id, start, end))
+                if start <= state_match.start and end >= state_match.end:
+                    j = state_match.offset
+                    if len(matches) == 0:
+                        state_match.offset = 0
+                        matches.append((ent_id, start, end))
+                    else:
+                        matches[j] = (ent_id, start, end)
+                    state_match.start = start
+                    state_match.end = end
+                else:
+                    pass
         for i, (ent_id, start, end) in enumerate(matches):
            on_match = self._callbacks.get(ent_id)
            if on_match is not None:
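Two pieces of bookkeeping in the hunk above are worth restating in plain Python. First, the pruning step: whenever two partial matches sit on the same slot of the same pattern, only the earlier-starting (longer) candidate survives, which caps the number of live partials per slot. A sketch under those assumptions, where a (pattern_id, pattern_pos) pair stands in for the C pattern pointer:

    def prune(partials):
        # Keep, for each (pattern, slot) position, only the partial with
        # the earliest start -- the greedy, longest-match candidate.
        best = {}
        for state in partials:
            key = (state.pattern_id, state.pattern_pos)
            if key not in best or state.start < best[key].start:
                best[key] = state
        return list(best.values())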
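Second, the last_match bookkeeping that replaces the old unconditional matches.append: a newly accepted span is appended if it begins at or after the end of the previous span from the same pattern, overwrites that span in place if it fully contains it, and is otherwise dropped. Roughly, using the MatchEntry sketch above and eliding the empty-list edge case:

    def record(matches, entry, ent_id, start, end):
        # entry plays the role of this pattern's MatchEntryC.
        if start >= entry.end:                           # disjoint, later span
            entry.start, entry.end, entry.offset = start, end, len(matches)
            matches.append((ent_id, start, end))
        elif start <= entry.start and end >= entry.end:  # swallows the old span
            matches[entry.offset] = (ent_id, start, end) # overwrite in place
            entry.start, entry.end = start, end
        # else: a shorter overlapping span -- discard it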
spacy/tests/regression/test_issue1450.py
@@ -13,8 +13,8 @@ from ...vocab import Vocab
         ('a b', 0, 2),
         ('a c', 0, 1),
         ('a b c', 0, 2),
-        ('a b b c', 0, 2),
-        ('a b b', 0, 2),
+        ('a b b c', 0, 3),
+        ('a b b', 0, 3),
     ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
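The expected end offsets change from 2 to 3 because a trailing zero-plus token now matches greedily: assuming the issue-1450 pattern ends in something like {'ORTH': 'b', 'OP': '*'} (the pattern itself lies outside this hunk), a match over 'a b b' now extends across both 'b' tokens, giving the span (0, 3) where the old matcher stopped at (0, 2).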
spacy/tests/regression/test_issue1855.py (new file, 63 lines)
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ...matcher import Matcher
+
+import pytest
+
+pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
+pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
+pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
+pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+
+re_pattern1 = 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3 = 'AA'
+re_pattern4 = 'BA*B'
+re_pattern5 = 'B*A*B'
+
+@pytest.fixture
+def text():
+    return "(ABBAAAAAB)."
+
+@pytest.fixture
+def doc(en_tokenizer, text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_greedy_matching(doc, text, pattern, re_pattern):
+    """
+    Test that the greedy matching behavior of the * op
+    is consistent with other re implementations.
+    """
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    for match, re_match in zip(matches, re_matches):
+        assert match[1:] == re_match
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_match_consuming(doc, text, pattern, re_pattern):
+    """
+    Test that matcher.__call__ consumes tokens on a match
+    similar to re.findall.
+    """
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    assert len(matches) == len(re_matches)
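Because the fixture joins the characters of the text with spaces, each character becomes its own token and token indices line up with character offsets in the original string, which is why match[1:] can be compared directly against re spans. Two concrete spans these tests pin down:

    import re

    text = "(ABBAAAAAB)."
    # 'BA*B' matches the inner 'BB' (the 'A*' consuming zero 'A's):
    assert [m.span() for m in re.finditer('BA*B', text)] == [(2, 4)]
    # 'AA*' consumes whole runs of 'A's, as re.findall would:
    assert [m.span() for m in re.finditer('AA*', text)] == [(1, 2), (4, 9)]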
spacy/tests/test_matcher_greedy.py (new file, 63 lines)
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ..matcher import Matcher
+
+import pytest
+
+pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
+pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
+pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
+pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+
+re_pattern1 = 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3 = 'AA'
+re_pattern4 = 'BA*B'
+re_pattern5 = 'B*A*B'
+
+@pytest.fixture
+def text():
+    return "(ABBAAAAAB)."
+
+@pytest.fixture
+def doc(en_tokenizer, text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_greedy_matching(doc, text, pattern, re_pattern):
+    """
+    Test that the greedy matching behavior of the * op
+    is consistent with other re implementations.
+    """
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    for match, re_match in zip(matches, re_matches):
+        assert match[1:] == re_match
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_match_consuming(doc, text, pattern, re_pattern):
+    """
+    Test that matcher.__call__ consumes tokens on a match
+    similar to re.findall.
+    """
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    assert len(matches) == len(re_matches)
@@ -161,11 +161,7 @@ p
 
 p
     | The #[code +] and #[code *] operators are usually interpreted
-    | "greedily", i.e. longer matches are returned where possible. However, if
-    | you specify two #[code +] and #[code *] patterns in a row and their
-    | matches overlap, the first operator will behave non-greedily. This quirk
-    | in the semantics makes the matcher more efficient, by avoiding the need
-    | for back-tracking.
+    | "greedily", i.e. longer matches are returned where possible.
 
 +h(3, "adding-phrase-patterns") Adding phrase patterns
 
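The deleted caveat documented the old non-greedy corner case; after this patch the docs can promise uniformly greedy behavior, which is exactly the contract the new tests pin to Python's re module. In regex terms:

    import re

    # '+' and '*' match greedily, so 'A*A' takes the whole run:
    assert re.match('A*A', 'AAA').span() == (0, 3)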