spaCy/spacy/_matcher2_notes.py

import pytest


class Vocab(object):
    pass


class Doc(list):
    def __init__(self, vocab, words=None):
        list.__init__(self)
        self.extend([Token(i, w) for i, w in enumerate(words)])


class Token(object):
    def __init__(self, i, word):
        self.i = i
        self.text = word


def find_matches(patterns, doc):
    init_states = [(pattern, 0, None) for pattern in patterns]
    curr_states = []
    matches = []
    for token in doc:
        nexts = []
        for state in (curr_states + init_states):
            matches, nexts = transition(state, token, matches, nexts)
        curr_states = nexts
    return matches
 

def transition(state, token, matches, nexts):
    action = get_action(state, token)
    is_match, keep_state, advance_state = [bool(int(c)) for c in action]
    pattern, i, start = state
    if start is None:
        start = token.i
    if is_match:
        matches.append((pattern, start, token.i+1))
    if advance_state:
        nexts.append((pattern, i+1, start))
    if keep_state:
        # TODO: This needs to be zero-width :(.
        nexts.append((pattern, i, start))
    return (matches, nexts)


def get_action(state, token):
    '''We need to consider:

    a) Does the token match the specification? [Yes, No]
    b) What's the quantifier? [1, 0+, ?]
    c) Is this the last specification? [final, non-final]

    We can transition in the following ways:

    a) Do we emit a match?
    b) Do we add a state with (next state, next token)?
    c) Do we add a state with (next state, same token)?
    d) Do we add a state with (same state, next token)?

    We'll code the actions as boolean strings, so 0000 means no to all 4,
    1000 means match but no states added, etc.
    
    1:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        0000
      No, non-final
        0000
    0+:
      Yes, final:
        1001
      Yes, non-final:
        0111
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010
    ?:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010

    Problem: If a quantifier is matching, we're adding a lot of open partials
    '''
    is_match = get_is_match(state, token)
    operator = get_operator(state, token)
    is_final = get_is_final(state, token)
    raise NotImplementedError


def get_is_match(state, token):
    pattern, i, start = state
    is_match = token.text == pattern[i]['spec']
    if pattern[i].get('invert'):
        return not is_match
    else:
        return is_match

def get_is_final(state, token):
    pattern, i, start = state
    return i == len(pattern)-1

def get_operator(state, token):
    pattern, i, start = state
    return pattern[i].get('op', '1')


########################
# Tests for get_action #
########################


def test_get_action_simple_match():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '100'


def test_get_action_simple_reject():
    pattern = [{'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '000'


def test_get_action_simple_match_match():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '100'


def test_get_action_simple_match_reject():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '000'


def test_get_action_simple_match_reject():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '000'


def test_get_action_plus_match():
    pattern = [{'spec': 'a', 'op': '1+'}]
    doc = Doc(Vocab(), words=['a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '110'


def test_get_action_plus_match_match():
    pattern = [{'spec': 'a', 'op': '1+'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '110'
    state = (pattern, 0, 0)
    action = get_action(state, doc[1])
    assert action == '110'


##########################
# Tests for find_matches #
##########################

def test_find_matches_simple_accept():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a'])
    matches = find_matches([pattern], doc)
    assert matches == [(pattern, 0, 1)]


def test_find_matches_simple_reject():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['b'])
    matches = find_matches([pattern], doc)
    assert matches == []


def test_find_matches_match_twice():
    pattern = [{'spec': 'a', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    matches = find_matches([pattern], doc)
    assert matches == [(pattern, 0, 1), (pattern, 1, 2)]


def test_find_matches_longer_pattern():
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'b'])
    matches = find_matches([pattern], doc)
    assert matches == [(pattern, 0, 2)]


def test_find_matches_two_patterns():
    patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]
    doc = Doc(Vocab(), words=['a', 'b'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)]


def test_find_matches_two_patterns_overlap():
    patterns = [[{'spec': 'a'}, {'spec': 'b'}],
                [{'spec': 'b'}, {'spec': 'c'}]]
    doc = Doc(Vocab(), words=['a', 'b', 'c'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)]


def test_find_matches_greedy():
    patterns = [[{'spec': 'a', 'op': '1+'}]]
    doc = Doc(Vocab(), words=['a'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1)]
    doc = Doc(Vocab(), words=['a', 'a'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]

def test_find_matches_non_greedy():
    patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]
    doc = Doc(Vocab(), words=['b'])
    matches = find_matches(patterns, doc)
    assert matches == [(patterns[0], 0, 1)]
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`import pytest`


			`class Vocab(object):`
			`pass`


			`class Doc(list):`
			`def __init__(self, vocab, words=None):`
			`list.__init__(self)`
			`self.extend([Token(i, w) for i, w in enumerate(words)])`


			`class Token(object):`
			`def __init__(self, i, word):`
			`self.i = i`
			`self.text = word`


			`def find_matches(patterns, doc):`
			`init_states = [(pattern, 0, None) for pattern in patterns]`
			`curr_states = []`
			`matches = []`
			`for token in doc:`
			`nexts = []`
			`for state in (curr_states + init_states):`
			`matches, nexts = transition(state, token, matches, nexts)`
			`curr_states = nexts`
			`return matches`


			`def transition(state, token, matches, nexts):`
			`action = get_action(state, token)`
			`is_match, keep_state, advance_state = [bool(int(c)) for c in action]`
			`pattern, i, start = state`
			`if start is None:`
			`start = token.i`
			`if is_match:`
			`matches.append((pattern, start, token.i+1))`
			`if advance_state:`
			`nexts.append((pattern, i+1, start))`
			`if keep_state:`
			`# TODO: This needs to be zero-width :(.`
			`nexts.append((pattern, i, start))`
			`return (matches, nexts)`


			`def get_action(state, token):`
			`'''We need to consider:`

			`a) Does the token match the specification? [Yes, No]`
			`b) What's the quantifier? [1, 0+, ?]`
			`c) Is this the last specification? [final, non-final]`

			`We can transition in the following ways:`

			`a) Do we emit a match?`
			`b) Do we add a state with (next state, next token)?`
			`c) Do we add a state with (next state, same token)?`
			`d) Do we add a state with (same state, next token)?`

			`We'll code the actions as boolean strings, so 0000 means no to all 4,`
			`1000 means match but no states added, etc.`

			`1:`
			`Yes, final:`
			`1000`
			`Yes, non-final:`
			`0100`
			`No, final:`
			`0000`
			`No, non-final`
			`0000`
			`0+:`
			`Yes, final:`
			`1001`
			`Yes, non-final:`
			`0111`
			`No, final:`
			`1000 (note: Don't include last token!)`
			`No, non-final:`
			`0010`
			`?:`
			`Yes, final:`
			`1000`
			`Yes, non-final:`
			`0100`
			`No, final:`
			`1000 (note: Don't include last token!)`
			`No, non-final:`
			`0010`

			`Problem: If a quantifier is matching, we're adding a lot of open partials`
			`'''`
			`is_match = get_is_match(state, token)`
			`operator = get_operator(state, token)`
			`is_final = get_is_final(state, token)`
			`raise NotImplementedError`


			`def get_is_match(state, token):`
			`pattern, i, start = state`
			`is_match = token.text == pattern[i]['spec']`
			`if pattern[i].get('invert'):`
			`return not is_match`
			`else:`
			`return is_match`

			`def get_is_final(state, token):`
			`pattern, i, start = state`
			`return i == len(pattern)-1`

			`def get_operator(state, token):`
			`pattern, i, start = state`
			`return pattern[i].get('op', '1')`


			`########################`
			`# Tests for get_action #`
			`########################`


			`def test_get_action_simple_match():`
			`pattern = [{'spec': 'a', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a'])`
			`state = (pattern, 0, None)`
			`action = get_action(state, doc[0])`
			`assert action == '100'`


			`def test_get_action_simple_reject():`
			`pattern = [{'spec': 'b', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a'])`
			`state = (pattern, 0, None)`
			`action = get_action(state, doc[0])`
			`assert action == '000'`


			`def test_get_action_simple_match_match():`
			`pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a', 'a'])`
			`state = (pattern, 0, None)`
			`action = get_action(state, doc[0])`
			`assert action == '001'`
			`state = (pattern, 1, 0)`
			`action = get_action(state, doc[1])`
			`assert action == '100'`


			`def test_get_action_simple_match_reject():`
			`pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a', 'a'])`
			`state = (pattern, 0, None)`
			`action = get_action(state, doc[0])`
			`assert action == '001'`
			`state = (pattern, 1, 0)`
			`action = get_action(state, doc[1])`
			`assert action == '000'`


			`def test_get_action_simple_match_reject():`
			`pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a', 'a'])`
			`state = (pattern, 0, None)`
			`action = get_action(state, doc[0])`
			`assert action == '001'`
			`state = (pattern, 1, 0)`
			`action = get_action(state, doc[1])`
			`assert action == '000'`


			`def test_get_action_plus_match():`
			`pattern = [{'spec': 'a', 'op': '1+'}]`
			`doc = Doc(Vocab(), words=['a'])`
			`state = (pattern, 0, None)`
			`action = get_action(state, doc[0])`
			`assert action == '110'`


			`def test_get_action_plus_match_match():`
			`pattern = [{'spec': 'a', 'op': '1+'}]`
			`doc = Doc(Vocab(), words=['a', 'a'])`
			`state = (pattern, 0, None)`
			`action = get_action(state, doc[0])`
			`assert action == '110'`
			`state = (pattern, 0, 0)`
			`action = get_action(state, doc[1])`
			`assert action == '110'`


			`##########################`
			`# Tests for find_matches #`
			`##########################`

			`def test_find_matches_simple_accept():`
			`pattern = [{'spec': 'a', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a'])`
			`matches = find_matches([pattern], doc)`
			`assert matches == [(pattern, 0, 1)]`


			`def test_find_matches_simple_reject():`
			`pattern = [{'spec': 'a', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['b'])`
			`matches = find_matches([pattern], doc)`
			`assert matches == []`


			`def test_find_matches_match_twice():`
			`pattern = [{'spec': 'a', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a', 'a'])`
			`matches = find_matches([pattern], doc)`
			`assert matches == [(pattern, 0, 1), (pattern, 1, 2)]`


			`def test_find_matches_longer_pattern():`
			`pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]`
			`doc = Doc(Vocab(), words=['a', 'b'])`
			`matches = find_matches([pattern], doc)`
			`assert matches == [(pattern, 0, 2)]`


			`def test_find_matches_two_patterns():`
			`patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]`
			`doc = Doc(Vocab(), words=['a', 'b'])`
			`matches = find_matches(patterns, doc)`
			`assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)]`


			`def test_find_matches_two_patterns_overlap():`
			`patterns = [[{'spec': 'a'}, {'spec': 'b'}],`
			`[{'spec': 'b'}, {'spec': 'c'}]]`
			`doc = Doc(Vocab(), words=['a', 'b', 'c'])`
			`matches = find_matches(patterns, doc)`
			`assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)]`


			`def test_find_matches_greedy():`
			`patterns = [[{'spec': 'a', 'op': '1+'}]]`
			`doc = Doc(Vocab(), words=['a'])`
			`matches = find_matches(patterns, doc)`
			`assert matches == [(patterns[0], 0, 1)]`
			`doc = Doc(Vocab(), words=['a', 'a'])`
			`matches = find_matches(patterns, doc)`
			`assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]`

			`def test_find_matches_non_greedy():`
			`patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]`
			`doc = Doc(Vocab(), words=['b'])`
			`matches = find_matches(patterns, doc)`
			`assert matches == [(patterns[0], 0, 1)]`