Add Python notes for rethinking matcher

2025-12-04 08:44:23 +03:00 · 2018-02-12 10:19:29 +01:00 · 2018-02-12 10:19:29 +01:00 · d34c732635
commit d34c732635
parent 0d3262a9f3
1 changed files with 251 additions and 0 deletions
--- a/spacy/_matcher2_notes.py
+++ b/spacy/_matcher2_notes.py
@ -0,0 +1,251 @@
 import pytest
 class Vocab(object):
    '''Empty placeholder vocabulary; it carries no state or behaviour.'''
 class Doc(list):
    '''Minimal document stand-in: a list holding one Token per word.

    vocab: accepted by the constructor but not used or stored here.
    words: iterable of word strings; None (the default) gives an empty Doc.
    '''
    def __init__(self, vocab, words=None):
        list.__init__(self)
        # The default for `words` is None, which enumerate() cannot iterate.
        # Treat it as "no words" so that Doc(vocab) builds an empty document
        # instead of raising TypeError.
        if words is not None:
            self.extend(Token(i, w) for i, w in enumerate(words))
 class Token(object):
    '''Minimal token stand-in exposing a position and a text.

    i: the index passed to the constructor.
    text: the word passed to the constructor.
    '''
    def __init__(self, i, word):
        self.i, self.text = i, word
 def find_matches(patterns, doc):
    '''Run every pattern over `doc` and collect the matches.

    A state is a (pattern, spec index, start) tuple. At each token the
    partial states carried over from the previous token are tried first,
    followed by one fresh state per pattern (spec index 0, start None).

    Returns the list of matches accumulated by transition(), in the order
    they were emitted.
    '''
    fresh_states = [(pattern, 0, None) for pattern in patterns]
    partials = []
    found = []
    for token in doc:
        upcoming = []
        # Carried-over partials are processed before the fresh states.
        for state in partials + fresh_states:
            found, upcoming = transition(state, token, found, upcoming)
        partials = upcoming
    return found
 def transition(state, token, matches, nexts):
    '''Apply the action chosen by get_action() for `state` at `token`.

    The action is a three-character string of '0'/'1' flags meaning, in
    order: emit a match, keep the current state, add an advanced state.
    `matches` and `nexts` are appended to in place and also returned as a
    (matches, nexts) tuple.
    '''
    emit, stay, step = (bool(int(flag)) for flag in get_action(state, token))
    pattern, i, start = state
    # A state that has not consumed anything yet starts at this token.
    start = token.i if start is None else start
    if emit:
        matches.append((pattern, start, token.i + 1))
    if stay:
        nexts.append((pattern, i, start))
    if step:
        nexts.append((pattern, i + 1, start))
    return matches, nexts
 def get_action(state, token):
    '''Choose the three-flag action for `state` at `token`.

    We need to consider:
    a) Does the token match the specification? [Yes, No]
    b) What's the quantifier? [1, 1+, 0+]
    c) Is this the last specification? [final, non-final]

    For each case we need to know whether to emit a match, whether to keep
    the current state in the partials, and whether to add an advanced state
    to the partials. The three booleans are coded, in that order, as a
    string of '0'/'1' characters:

    Operator '1':
        no match:             000
        match, final:         100
        match, non-final:     001
    Operator '1+':
        no match:             000
        match, final:         110
        match, non-final:     011
    Operator '0+':
        final (match or not): 100
        match, non-final:     011
        no match, non-final:  010

    Problem: If a quantifier is matching, we're adding a lot of open partials

    Raises ValueError if the operator is not one of '1', '1+' or '0+'.
    '''
    is_match = get_is_match(state, token)
    operator = get_operator(state, token)
    is_final = get_is_final(state, token)
    if operator == '1':
        if not is_match:
            return '000'
        elif is_final:
            return '100'
        else:
            return '001'
    elif operator == '1+':
        if not is_match:
            return '000'
        if is_final:
            return '110'
        else:
            return '011'
    elif operator == '0+':
        # A final '0+' spec emits a match whether or not the token matched.
        if is_final:
            return '100'
        elif is_match:
            return '011'
        else:
            return '010'
    else:
        # Carry the diagnostic in the exception itself rather than printing
        # it to stdout and raising a message-less error.
        raise ValueError(
            'Unknown operator %r (is_match=%r, is_final=%r)'
            % (operator, is_match, is_final))
 def get_is_match(state, token):
    '''Tell whether `token` satisfies the state's current specification.

    The token's text is compared with the specification's 'spec' value; a
    truthy 'invert' entry in the specification negates the result.
    '''
    pattern, i, _ = state
    spec = pattern[i]
    same_text = token.text == spec['spec']
    return (not same_text) if spec.get('invert') else same_text
 def get_is_final(state, token):
    '''Tell whether the state points at the last specification of its pattern.

    `token` is accepted for a uniform signature but is not consulted.
    '''
    pattern, i, _ = state
    last_index = len(pattern) - 1
    return i == last_index
 def get_operator(state, token):
    '''Return the quantifier of the state's current specification.

    The value is read from the specification's 'op' entry and defaults to
    '1' when that entry is absent. `token` is not consulted.
    '''
    pattern, i, _ = state
    spec = pattern[i]
    return spec.get('op', '1')
 ########################
 # Tests for get_action #
 ########################
 def test_get_action_simple_match():
    '''A lone '1' spec equal to the token yields the action '100'.'''
    spec_a = {'spec': 'a', 'op': '1'}
    tokens = Doc(Vocab(), words=['a'])
    assert get_action(([spec_a], 0, None), tokens[0]) == '100'
 def test_get_action_simple_reject():
    '''A lone '1' spec that differs from the token yields the action '000'.'''
    spec_b = {'spec': 'b', 'op': '1'}
    tokens = Doc(Vocab(), words=['a'])
    assert get_action(([spec_b], 0, None), tokens[0]) == '000'
 def test_get_action_simple_match_match():
    '''Two 'a' specs over two 'a' tokens: advance ('001'), then emit ('100').'''
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
    first, second = Doc(Vocab(), words=['a', 'a'])
    steps = [
        ((pattern, 0, None), first, '001'),
        ((pattern, 1, 0), second, '100'),
    ]
    for state, token, expected in steps:
        assert get_action(state, token) == expected
 def test_get_action_simple_match_reject():
    '''Pattern 'a' then 'b' over two 'a' tokens: advance, then reject.

    The first token matches the non-final 'a' spec ('001'); the second
    token does not match the final 'b' spec ('000').
    '''
    # This test used to be defined twice with identical bodies; the second
    # definition merely shadowed the first, so a single copy is kept.
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '000'
 def test_get_action_plus_match():
    '''A final '1+' spec that matches yields '110' (emit and keep the state).'''
    spec_a_plus = {'spec': 'a', 'op': '1+'}
    tokens = Doc(Vocab(), words=['a'])
    assert get_action(([spec_a_plus], 0, None), tokens[0]) == '110'
 def test_get_action_plus_match_match():
    '''A final '1+' spec yields '110' on each of two matching tokens.'''
    pattern = [{'spec': 'a', 'op': '1+'}]
    first, second = Doc(Vocab(), words=['a', 'a'])
    steps = [
        ((pattern, 0, None), first),
        ((pattern, 0, 0), second),
    ]
    for state, token in steps:
        assert get_action(state, token) == '110'
 ##########################
 # Tests for find_matches #
 ##########################
 def test_find_matches_simple_accept():
    '''A one-spec pattern over a one-token doc that matches gives span (0, 1).'''
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a']))
    assert found == [(pattern, 0, 1)]
 def test_find_matches_simple_reject():
    '''A one-spec pattern over a non-matching token gives no matches.'''
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['b']))
    assert found == []
 def test_find_matches_match_twice():
    '''A one-spec pattern matches separately at each of two equal tokens.'''
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a', 'a']))
    expected_spans = [(0, 1), (1, 2)]
    assert found == [(pattern, start, end) for start, end in expected_spans]
 def test_find_matches_longer_pattern():
    '''A two-spec pattern 'a', 'b' over ['a', 'b'] gives the single span (0, 2).'''
    spec_a = {'spec': 'a', 'op': '1'}
    spec_b = {'spec': 'b', 'op': '1'}
    pattern = [spec_a, spec_b]
    found = find_matches([pattern], Doc(Vocab(), words=['a', 'b']))
    assert found == [(pattern, 0, 2)]
 def test_find_matches_two_patterns():
    '''Two one-spec patterns each match their own token in ['a', 'b'].'''
    pattern_a = [{'spec': 'a', 'op': '1'}]
    pattern_b = [{'spec': 'b', 'op': '1'}]
    tokens = Doc(Vocab(), words=['a', 'b'])
    found = find_matches([pattern_a, pattern_b], tokens)
    assert found == [(pattern_a, 0, 1), (pattern_b, 1, 2)]
 def test_find_matches_two_patterns_overlap():
    '''Patterns 'a b' and 'b c' both match over ['a', 'b', 'c'], sharing 'b'.

    The specs carry no 'op' entry, so the default operator applies.
    '''
    pattern_ab = [{'spec': 'a'}, {'spec': 'b'}]
    pattern_bc = [{'spec': 'b'}, {'spec': 'c'}]
    tokens = Doc(Vocab(), words=['a', 'b', 'c'])
    found = find_matches([pattern_ab, pattern_bc], tokens)
    assert found == [(pattern_ab, 0, 2), (pattern_bc, 1, 3)]
 def test_find_matches_greedy():
    '''A lone '1+' spec emits every span of consecutive matching tokens.'''
    pattern = [{'spec': 'a', 'op': '1+'}]
    cases = [
        (['a'], [(0, 1)]),
        (['a', 'a'], [(0, 1), (0, 2), (1, 2)]),
    ]
    for words, spans in cases:
        found = find_matches([pattern], Doc(Vocab(), words=words))
        assert found == [(pattern, start, end) for start, end in spans]
 def test_find_matches_non_greedy():
    '''An optional 'a' ('0+') followed by 'b' is expected to match a lone 'b'.

    NOTE(review): for a non-final '0+' spec that does not match, get_action
    answers '010' (keep the state, no advance, no emit), so this expectation
    does not appear to be met by the current notes -- confirm.
    '''
    pattern = [{'spec': 'a', 'op': '0+'}, {'spec': 'b'}]
    found = find_matches([pattern], Doc(Vocab(), words=['b']))
    assert found == [(pattern, 0, 1)]