Merge pull request #1876 from GregDubbin/master

Pattern matcher fixes
Matthew Honnibal 2018-01-24 16:38:11 +01:00 committed by GitHub
commit 6a8cb905aa
5 changed files with 329 additions and 33 deletions


@@ -8,9 +8,15 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
+from cython.operator cimport dereference as deref
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
+# try:
+#     from libcpp.unordered_map cimport unordered_map as umap
+# except:
+#     from libcpp.map cimport map as umap
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .structs cimport TokenC
@@ -62,10 +68,11 @@ cdef enum action_t:
     REPEAT
     ACCEPT
     ADVANCE_ZERO
+    ADVANCE_PLUS
     ACCEPT_PREV
     PANIC

+# A "match expression" consists of one or more token patterns.
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
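To make that data model concrete, here is a minimal pure-Python sketch (the names and tuple layout are illustrative only, not the actual Cython structs):

# Illustrative sketch of a "match expression": a list of token patterns,
# each pairing a quantifier with the (attr, value) constraints a token
# must satisfy. Quantifier names mirror the enum in matcher.pyx.
ONE, ZERO, ZERO_ONE, ZERO_PLUS = range(4)

pattern = [
    (ONE,       [('ORTH', 'B')]),   # exactly one token with ORTH == 'B'
    (ZERO_PLUS, [('ORTH', 'A')]),   # any number of 'A' tokens
    (ONE,       [('ORTH', 'B')]),   # one closing 'B'
]

# A state is an (int, pattern position) pair: where the partial match
# started, and how far into the pattern it has progressed.
state = (0, 1)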
@@ -83,7 +90,25 @@ cdef struct TokenPatternC:
 ctypedef TokenPatternC* TokenPatternC_ptr
-ctypedef pair[int, TokenPatternC_ptr] StateC
+# ctypedef pair[int, TokenPatternC_ptr] StateC

+# Match Dictionary entry type
+cdef struct MatchEntryC:
+    int32_t start
+    int32_t end
+    int32_t offset
+
+# A state instance represents the information that defines a partial match:
+#   start: the index of the first token in the partial match
+#   pattern: a pointer to the current token pattern in the full pattern
+#   last_match: the entry of the last span matched by the same pattern
+cdef struct StateC:
+    int32_t start
+    TokenPatternC_ptr pattern
+    MatchEntryC* last_match

 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
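Read as plain Python, the two new structs amount to the following (a sketch for orientation only; the real fields are C integers and raw pointers):

from dataclasses import dataclass

@dataclass
class MatchEntry:           # mirrors MatchEntryC
    start: int              # first token of the last span this pattern matched
    end: int                # one past the last token of that span
    offset: int             # index of that span in the output matches list

@dataclass
class State:                # mirrors StateC
    start: int              # index of the first token in the partial match
    pattern_pos: int        # stands in for the TokenPatternC pointer
    last_match: MatchEntry  # shared by all states spawned by the same pattern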
@@ -128,7 +153,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if pattern.quantifier == ZERO:
             return REJECT
         elif lookahead.nr_attr == 0:
-            return ACCEPT
+            if pattern.quantifier == ZERO_PLUS:
+                return REPEAT
+            else:
+                return ACCEPT
     elif pattern.quantifier in (ONE, ZERO_ONE):
         return ADVANCE
     elif pattern.quantifier == ZERO_PLUS:
@@ -138,7 +166,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if next_action is REJECT:
             return REPEAT
         else:
-            return ADVANCE_ZERO
+            return ADVANCE_PLUS
     else:
         return PANIC
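These two `get_action` changes implement the greediness fix: when the next pattern slot is the end of the pattern (`nr_attr == 0`), a `ZERO_PLUS` quantifier now returns `REPEAT` instead of `ACCEPT`, so a trailing `*` keeps consuming tokens; and the repeat-or-advance case returns the new `ADVANCE_PLUS`, which the caller expands into a `REPEAT` plus a forked state that has already advanced past the `*` slot. A sketch of the first change (string constants stand in for the enum):

REPEAT, ACCEPT = 'REPEAT', 'ACCEPT'

def action_when_next_slot_ends_pattern(quantifier):
    # Previously this always returned ACCEPT; now a trailing '*' keeps
    # trying to extend the match instead of stopping at the first chance.
    if quantifier == 'ZERO_PLUS':
        return REPEAT
    return ACCEPT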
@@ -339,77 +367,223 @@ cdef class Matcher:
         cdef int i, token_i
         cdef const TokenC* token
         cdef StateC state
+        cdef int j = 0
+        cdef int k
+        cdef bint overlap = False
+        cdef MatchEntryC* state_match
+        cdef MatchEntryC* last_matches = <MatchEntryC*>self.mem.alloc(self.patterns.size(), sizeof(MatchEntryC))
+        for i in range(self.patterns.size()):
+            last_matches[i].start = 0
+            last_matches[i].end = 0
+            last_matches[i].offset = 0
         matches = []
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
             # Go over the open matches, extending or finalizing if able.
             # Otherwise, we over-write them (q doesn't advance)
-            for state in partials:
-                action = get_action(state.second, token)
+            j = 0
+            while j < n_partials:
+                state = partials[j]
+                action = get_action(state.pattern, token)
+                j += 1
+                # Skip patterns that would overlap with an existing match.
+                # A pattern overlaps an existing match if it points to the
+                # same final state and starts between the start and end of
+                # said match. Different patterns with the same label are
+                # allowed to overlap.
+                state_match = state.last_match
+                if (state.start > state_match.start
+                        and state.start < state_match.end):
+                    continue
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 while action == ADVANCE_ZERO:
-                    state.second += 1
-                    action = get_action(state.second, token)
+                    state.pattern += 1
+                    action = get_action(state.pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
+                # ADVANCE_PLUS acts like REPEAT, but also pushes a partial
+                # that acts like an ADVANCE_ZERO
+                if action == ADVANCE_PLUS:
+                    state.pattern += 1
+                    partials.push_back(state)
+                    n_partials += 1
+                    state.pattern -= 1
+                    action = REPEAT
+                if action == ADVANCE:
+                    state.pattern += 1
+                # Check for partial matches that are at the same spec in the
+                # same pattern, and keep the longer of the two. This ensures
+                # that there are never more than 2 partials for every spec
+                # in a pattern (one of which gets pruned in this step).
+                overlap = False
+                for i in range(q):
+                    if state.pattern == partials[i].pattern and state.start < partials[i].start:
+                        partials[i] = state
+                        j = i
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+                overlap = False
+                for i in range(q):
+                    if state.pattern == partials[i].pattern:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
                     # (i.e. we don't overwrite -- we want to greedily match
                     # more pattern.)
+                    partials[q] = state
                     q += 1
                 elif action == REJECT:
                     pass
                 elif action == ADVANCE:
                     partials[q] = state
-                    partials[q].second += 1
                     q += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    start = state.first
+                    start = state.start
                     end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.second[1].attrs[0].value
-                    label = state.second[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    ent_id = state.pattern[1].attrs[0].value
+                    label = state.pattern[1].attrs[1].value
+                    # Check that this match doesn't overlap with an earlier
+                    # match. Only overwrite an earlier match if it is a
+                    # substring of this match (i.e. it starts after this
+                    # match starts).
+                    state_match = state.last_match
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
+                        matches.append((ent_id, start, end))
+                    elif start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            assert state_match.offset == 0
+                            state_match.offset = 0
+                            matches.append((ent_id, start, end))
+                        else:
+                            i = state_match.offset
+                            matches[i] = (ent_id, start, end)
+                        state_match.start = start
+                        state_match.end = end
+                    else:
+                        pass
             partials.resize(q)
+            n_partials = q
             # Check whether we open any new patterns on this token
+            i = 0
             for pattern in self.patterns:
+                # Skip patterns that would overlap with an existing match
+                state_match = &last_matches[i]
+                i += 1
+                if (token_i > state_match.start
+                        and token_i < state_match.end):
+                    continue
                 action = get_action(pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
-                while action == ADVANCE_ZERO:
+                while action in (ADVANCE_PLUS, ADVANCE_ZERO):
+                    if action == ADVANCE_PLUS:
+                        state.start = token_i
+                        state.pattern = pattern
+                        state.last_match = state_match
+                        partials.push_back(state)
+                        n_partials += 1
                     pattern += 1
                     action = get_action(pattern, token)
+                if action == ADVANCE:
+                    pattern += 1
+                overlap = False
+                for j in range(q):
+                    if pattern == partials[j].pattern:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
                 if action == REPEAT:
-                    state.first = token_i
-                    state.second = pattern
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
+                    n_partials += 1
                 elif action == ADVANCE:
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    state.first = token_i
-                    state.second = pattern + 1
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
+                    n_partials += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     start = token_i
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
                     label = pattern[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
+                        matches.append((ent_id, start, end))
+                    if start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            state_match.offset = 0
+                            matches.append((ent_id, start, end))
+                        else:
+                            j = state_match.offset
+                            matches[j] = (ent_id, start, end)
+                        state_match.start = start
+                        state_match.end = end
+                    else:
+                        pass
         # Look for open patterns that are actually satisfied
         for state in partials:
-            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
-                state.second += 1
-            if state.second.nr_attr == 0:
-                start = state.first
+            while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
+                state.pattern += 1
+            if state.pattern.nr_attr == 0:
+                start = state.start
                 end = len(doc)
-                ent_id = state.second.attrs[0].value
-                label = state.second.attrs[0].value
-                matches.append((ent_id, start, end))
+                ent_id = state.pattern.attrs[0].value
+                label = state.pattern.attrs[1].value
+                state_match = state.last_match
+                if start >= state_match.end:
+                    state_match.start = start
+                    state_match.end = end
+                    state_match.offset = len(matches)
+                    matches.append((ent_id, start, end))
+                if start <= state_match.start and end >= state_match.end:
+                    j = state_match.offset
+                    if len(matches) == 0:
+                        state_match.offset = 0
+                        matches.append((ent_id, start, end))
+                    else:
+                        matches[j] = (ent_id, start, end)
+                    state_match.start = start
+                    state_match.end = end
+                else:
+                    pass
         for i, (ent_id, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
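The net effect of the rewritten loop is easiest to see against `re`, which the new tests below use as a reference. Under greedy, consuming semantics, a pattern like `AA*` yields two non-overlapping spans over the test text rather than one candidate per start position (a sketch of the intended behavior, not the Cython code):

import re

text = "(ABBAAAAAB)."
print([m.span() for m in re.finditer("AA*", text)])
# [(1, 2), (4, 9)] -- '*' extends as far as it can, and a finished
# match consumes its tokens, so no overlapping span is reported.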


@@ -13,8 +13,8 @@ from ...vocab import Vocab
     ('a b', 0, 2),
     ('a c', 0, 1),
     ('a b c', 0, 2),
-    ('a b b c', 0, 2),
-    ('a b b', 0, 2),
+    ('a b b c', 0, 3),
+    ('a b b', 0, 3),
 ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
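The expected ends move from 2 to 3 because the trailing `*` is now greedy. Sketched in `re` form (the actual token pattern lives in the test file), `ab*` swallows both `b`s in `abb`:

import re

print(re.match("ab*", "abb").span())  # (0, 3) under greedy matching, not (0, 2)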


@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+import pytest
+
+from ...matcher import Matcher
+
+pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
+pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
+pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
+pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+
+re_pattern1 = 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3 = 'AA'
+re_pattern4 = 'BA*B'
+re_pattern5 = 'B*A*B'
+
+
+@pytest.fixture
+def text():
+    return "(ABBAAAAAB)."
+
+
+@pytest.fixture
+def doc(en_tokenizer, text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_greedy_matching(doc, text, pattern, re_pattern):
+    """Test that the greedy matching behavior of the * op
+    is consistent with other re implementations."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    for match, re_match in zip(matches, re_matches):
+        assert match[1:] == re_match
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_match_consuming(doc, text, pattern, re_pattern):
+    """Test that matcher.__call__ consumes tokens on a match
+    similar to re.findall."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    assert len(matches) == len(re_matches)


@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+import pytest
+
+from ..matcher import Matcher
+
+pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
+pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
+pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
+pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+
+re_pattern1 = 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3 = 'AA'
+re_pattern4 = 'BA*B'
+re_pattern5 = 'B*A*B'
+
+
+@pytest.fixture
+def text():
+    return "(ABBAAAAAB)."
+
+
+@pytest.fixture
+def doc(en_tokenizer, text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_greedy_matching(doc, text, pattern, re_pattern):
+    """Test that the greedy matching behavior of the * op
+    is consistent with other re implementations."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    for match, re_match in zip(matches, re_matches):
+        assert match[1:] == re_match
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_match_consuming(doc, text, pattern, re_pattern):
+    """Test that matcher.__call__ consumes tokens on a match
+    similar to re.findall."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    assert len(matches) == len(re_matches)


@@ -161,11 +161,7 @@ p
 p
     | The #[code +] and #[code *] operators are usually interpreted
-    | "greedily", i.e. longer matches are returned where possible. However, if
-    | you specify two #[code +] and #[code *] patterns in a row and their
-    | matches overlap, the first operator will behave non-greedily. This quirk
-    | in the semantics makes the matcher more efficient, by avoiding the need
-    | for back-tracking.
+    | "greedily", i.e. longer matches are returned where possible.

 +h(3, "adding-phrase-patterns") Adding phrase patterns
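With the non-greedy caveat deleted, the documented behavior is simply that `+` and `*` return the longest match available. A minimal usage sketch under that reading (assumes the spaCy 2.x `Matcher.add(key, callback, pattern)` signature used in the tests above):

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=['B', 'A', 'A', 'B'])
matcher = Matcher(vocab)
matcher.add('BA*B', None,
            [{'ORTH': 'B'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B'}])
for match_id, start, end in matcher(doc):
    print(start, end)  # expect the single greedy span (0, 4)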