Merge pull request #1402 from explosion/feature/fix-matcher-operators

💫 Fix Matcher variable-length operators
This commit is contained in:
Matthew Honnibal 2017-10-16 17:53:19 +02:00 committed by GitHub
commit 010a7309ff
3 changed files with 95 additions and 28 deletions

View File

@ -71,6 +71,11 @@ cdef enum action_t:
ADVANCE_ZERO
PANIC
# A "match expression" conists of one or more token patterns
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
# A state is an (int, pattern pointer) pair, where the int is the start
# position, and the pattern pointer shows where we're up to
# in the pattern.
cdef struct AttrValueC:
attr_id_t attr
@ -130,7 +135,13 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
elif pattern.quantifier in (ONE, ZERO_ONE):
return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
elif pattern.quantifier == ZERO_PLUS:
return REPEAT
# This is a bandaid over the 'shadowing' problem described here:
# https://github.com/explosion/spaCy/issues/864
next_action = get_action(pattern+1, token)
if next_action is REJECT:
return REPEAT
else:
return ADVANCE_ZERO
else:
return PANIC
@ -220,16 +231,28 @@ cdef class Matcher:
return len(self._patterns)
def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher.
A match-rule consists of: an ID key, an on_match callback, and one or
more patterns. If the key exists, the patterns are appended to the
previous ones, and the previous on_match callback is replaced. The
`on_match` callback will receive the arguments `(matcher, doc, i,
matches)`. You can also set `on_match` to `None` to not perform any
actions. A pattern consists of one or more `token_specs`, where a
`token_spec` is a dictionary mapping attribute IDs to values. Token
descriptors can also include quantifiers. There are currently important
known problems with the quantifiers see the docs.
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
an on_match callback, and one or more patterns.
If the key exists, the patterns are appended to the previous ones, and
the previous on_match callback is replaced. The `on_match` callback will
receive the arguments `(matcher, doc, i, matches)`. You can also set
`on_match` to `None` to not perform any actions.
A pattern consists of one or more `token_specs`, where a `token_spec`
is a dictionary mapping attribute IDs to values, and optionally a
quantifier operator under the key "op". The available quantifiers are:
'!': Negate the pattern, by requiring it to match exactly 0 times.
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
'+': Require the pattern to match 1 or more times.
'*': Allow the pattern to zero or more times.
The + and * operators are usually interpretted "greedily", i.e. longer
matches are returned where possible. However, if you specify two '+'
and '*' patterns in a row and their matches overlap, the first
operator will behave non-greedily. This quirk in the semantics
makes the matcher more efficient, by avoiding the need for back-tracking.
"""
for pattern in patterns:
if len(pattern) == 0:

View File

@ -107,11 +107,22 @@ def test_matcher_empty_dict(en_vocab):
matches = matcher(doc)
assert len(matches) == 1
assert matches[0][1:] == (0, 3)
matcher = Matcher(en_vocab)
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
matches = matcher(doc)
assert matches[0][1:] == (0, 2)
def test_matcher_operator_shadow(en_vocab):
matcher = Matcher(en_vocab)
abc = ["a", "b", "c"]
doc = get_doc(matcher.vocab, abc)
matcher.add('A.C', None, [{'ORTH': 'a'},
{"IS_ALPHA": True, "OP": "+"},
{'ORTH': 'c'}])
matches = matcher(doc)
assert len(matches) == 1
assert matches[0][1:] == (0, 3)
def test_matcher_phrase_matcher(en_vocab):
words = ["Google", "Now"]
doc = get_doc(en_vocab, words)
@ -165,3 +176,39 @@ def test_matcher_match_one_plus(matcher):
{'ORTH': 'Philippe', 'OP': '+'}])
m = matcher(doc)
assert len(m) == 1
def test_operator_combos(matcher):
cases = [
('aaab', 'a a a b', True),
('aaab', 'a+ b', True),
('aaab', 'a+ a+ b', True),
('aaab', 'a+ a+ a b', True),
('aaab', 'a+ a+ a+ b', True),
('aaab', 'a+ a a b', True),
('aaab', 'a+ a a', True),
('aaab', 'a+', True),
('aaa', 'a+ b', False),
('aaa', 'a+ a+ b', False),
('aaa', 'a+ a+ a+ b', False),
('aaa', 'a+ a b', False),
('aaa', 'a+ a a b', False),
('aaab', 'a+ a a', True),
('aaab', 'a+', True),
('aaab', 'a+ a b', True),
]
for string, pattern_str, result in cases:
matcher = Matcher(matcher.vocab)
doc = get_doc(matcher.vocab, words=list(string))
pattern = []
for part in pattern_str.split():
if part.endswith('+'):
pattern.append({'ORTH': part[0], 'op': '+'})
else:
pattern.append({'ORTH': part})
matcher.add('PATTERN', None, pattern)
matches = matcher(doc)
if result:
assert matches, (string, pattern_str)
else:
assert not matches, (string, pattern_str)

View File

@ -142,33 +142,30 @@ p
| are no nested or scoped quantifiers instead, you can build those
| behaviours with #[code on_match] callbacks.
+aside("Problems with quantifiers")
| Using quantifiers may lead to unexpected results when matching
| variable-length patterns, for example if the next token would also be
| matched by the previous token. This problem should be resolved in a future
| release. For more information, see
| #[+a(gh("spaCy") + "/issues/864") this issue].
+table([ "OP", "Description", "Example"])
+table([ "OP", "Description"])
+row
+cell #[code !]
+cell match exactly 0 times
+cell negation
+cell Negate the pattern, by requiring it to match exactly 0 times.
+row
+cell #[code *]
+cell match 0 or more times
+cell optional, variable number
+cell Make the pattern optional, by allowing it to match 0 or 1 times.
+row
+cell #[code +]
+cell match 1 or more times
+cell mandatory, variable number
+cell Require the pattern to match 1 or more times.
+row
+cell #[code ?]
+cell match 0 or 1 times
+cell optional, max one
+cell Allow the pattern to zero or more times.
p
| The #[code +] and #[code *] operators are usually interpretted
| "greedily", i.e. longer matches are returned where possible. However, if
| you specify two #[code +] and #[code *] patterns in a row and their
| matches overlap, the first operator will behave non-greedily. This quirk
| in the semantics makes the matcher more efficient, by avoiding the need
| for back-tracking.
+h(3, "adding-phrase-patterns") Adding phrase patterns