mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge pull request #1402 from explosion/feature/fix-matcher-operators
💫 Fix Matcher variable-length operators
This commit is contained in:
commit
010a7309ff
|
@ -71,6 +71,11 @@ cdef enum action_t:
|
||||||
ADVANCE_ZERO
|
ADVANCE_ZERO
|
||||||
PANIC
|
PANIC
|
||||||
|
|
||||||
|
# A "match expression" consists of one or more token patterns
|
||||||
|
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
|
||||||
|
# A state is an (int, pattern pointer) pair, where the int is the start
|
||||||
|
# position, and the pattern pointer shows where we're up to
|
||||||
|
# in the pattern.
|
||||||
|
|
||||||
cdef struct AttrValueC:
|
cdef struct AttrValueC:
|
||||||
attr_id_t attr
|
attr_id_t attr
|
||||||
|
@ -130,7 +135,13 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
elif pattern.quantifier in (ONE, ZERO_ONE):
|
elif pattern.quantifier in (ONE, ZERO_ONE):
|
||||||
return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
|
return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
|
||||||
elif pattern.quantifier == ZERO_PLUS:
|
elif pattern.quantifier == ZERO_PLUS:
|
||||||
return REPEAT
|
# This is a bandaid over the 'shadowing' problem described here:
|
||||||
|
# https://github.com/explosion/spaCy/issues/864
|
||||||
|
next_action = get_action(pattern+1, token)
|
||||||
|
if next_action is REJECT:
|
||||||
|
return REPEAT
|
||||||
|
else:
|
||||||
|
return ADVANCE_ZERO
|
||||||
else:
|
else:
|
||||||
return PANIC
|
return PANIC
|
||||||
|
|
||||||
|
@ -220,16 +231,28 @@ cdef class Matcher:
|
||||||
return len(self._patterns)
|
return len(self._patterns)
|
||||||
|
|
||||||
def add(self, key, on_match, *patterns):
|
def add(self, key, on_match, *patterns):
|
||||||
"""Add a match-rule to the matcher.
|
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
|
||||||
A match-rule consists of: an ID key, an on_match callback, and one or
|
an on_match callback, and one or more patterns.
|
||||||
more patterns. If the key exists, the patterns are appended to the
|
|
||||||
previous ones, and the previous on_match callback is replaced. The
|
If the key exists, the patterns are appended to the previous ones, and
|
||||||
`on_match` callback will receive the arguments `(matcher, doc, i,
|
the previous on_match callback is replaced. The `on_match` callback will
|
||||||
matches)`. You can also set `on_match` to `None` to not perform any
|
receive the arguments `(matcher, doc, i, matches)`. You can also set
|
||||||
actions. A pattern consists of one or more `token_specs`, where a
|
`on_match` to `None` to not perform any actions.
|
||||||
`token_spec` is a dictionary mapping attribute IDs to values. Token
|
|
||||||
descriptors can also include quantifiers. There are currently important
|
A pattern consists of one or more `token_specs`, where a `token_spec`
|
||||||
known problems with the quantifiers – see the docs.
|
is a dictionary mapping attribute IDs to values, and optionally a
|
||||||
|
quantifier operator under the key "op". The available quantifiers are:
|
||||||
|
|
||||||
|
'!': Negate the pattern, by requiring it to match exactly 0 times.
|
||||||
|
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
||||||
|
'+': Require the pattern to match 1 or more times.
|
||||||
|
'*': Allow the pattern to match zero or more times.
|
||||||
|
|
||||||
|
The + and * operators are usually interpreted "greedily", i.e. longer
|
||||||
|
matches are returned where possible. However, if you specify two '+'
|
||||||
|
and '*' patterns in a row and their matches overlap, the first
|
||||||
|
operator will behave non-greedily. This quirk in the semantics
|
||||||
|
makes the matcher more efficient, by avoiding the need for back-tracking.
|
||||||
"""
|
"""
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
if len(pattern) == 0:
|
if len(pattern) == 0:
|
||||||
|
|
|
@ -107,11 +107,22 @@ def test_matcher_empty_dict(en_vocab):
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
assert matches[0][1:] == (0, 3)
|
assert matches[0][1:] == (0, 3)
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert matches[0][1:] == (0, 2)
|
assert matches[0][1:] == (0, 2)
|
||||||
|
|
||||||
|
def test_matcher_operator_shadow(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
abc = ["a", "b", "c"]
|
||||||
|
doc = get_doc(matcher.vocab, abc)
|
||||||
|
matcher.add('A.C', None, [{'ORTH': 'a'},
|
||||||
|
{"IS_ALPHA": True, "OP": "+"},
|
||||||
|
{'ORTH': 'c'}])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 1
|
||||||
|
assert matches[0][1:] == (0, 3)
|
||||||
|
|
||||||
def test_matcher_phrase_matcher(en_vocab):
|
def test_matcher_phrase_matcher(en_vocab):
|
||||||
words = ["Google", "Now"]
|
words = ["Google", "Now"]
|
||||||
doc = get_doc(en_vocab, words)
|
doc = get_doc(en_vocab, words)
|
||||||
|
@ -165,3 +176,39 @@ def test_matcher_match_one_plus(matcher):
|
||||||
{'ORTH': 'Philippe', 'OP': '+'}])
|
{'ORTH': 'Philippe', 'OP': '+'}])
|
||||||
m = matcher(doc)
|
m = matcher(doc)
|
||||||
assert len(m) == 1
|
assert len(m) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_operator_combos(matcher):
|
||||||
|
cases = [
|
||||||
|
('aaab', 'a a a b', True),
|
||||||
|
('aaab', 'a+ b', True),
|
||||||
|
('aaab', 'a+ a+ b', True),
|
||||||
|
('aaab', 'a+ a+ a b', True),
|
||||||
|
('aaab', 'a+ a+ a+ b', True),
|
||||||
|
('aaab', 'a+ a a b', True),
|
||||||
|
('aaab', 'a+ a a', True),
|
||||||
|
('aaab', 'a+', True),
|
||||||
|
('aaa', 'a+ b', False),
|
||||||
|
('aaa', 'a+ a+ b', False),
|
||||||
|
('aaa', 'a+ a+ a+ b', False),
|
||||||
|
('aaa', 'a+ a b', False),
|
||||||
|
('aaa', 'a+ a a b', False),
|
||||||
|
('aaab', 'a+ a a', True),
|
||||||
|
('aaab', 'a+', True),
|
||||||
|
('aaab', 'a+ a b', True),
|
||||||
|
]
|
||||||
|
for string, pattern_str, result in cases:
|
||||||
|
matcher = Matcher(matcher.vocab)
|
||||||
|
doc = get_doc(matcher.vocab, words=list(string))
|
||||||
|
pattern = []
|
||||||
|
for part in pattern_str.split():
|
||||||
|
if part.endswith('+'):
|
||||||
|
pattern.append({'ORTH': part[0], 'op': '+'})
|
||||||
|
else:
|
||||||
|
pattern.append({'ORTH': part})
|
||||||
|
matcher.add('PATTERN', None, pattern)
|
||||||
|
matches = matcher(doc)
|
||||||
|
if result:
|
||||||
|
assert matches, (string, pattern_str)
|
||||||
|
else:
|
||||||
|
assert not matches, (string, pattern_str)
|
||||||
|
|
|
@ -142,33 +142,30 @@ p
|
||||||
| are no nested or scoped quantifiers – instead, you can build those
|
| are no nested or scoped quantifiers – instead, you can build those
|
||||||
| behaviours with #[code on_match] callbacks.
|
| behaviours with #[code on_match] callbacks.
|
||||||
|
|
||||||
+aside("Problems with quantifiers")
|
+table([ "OP", "Description"])
|
||||||
| Using quantifiers may lead to unexpected results when matching
|
|
||||||
| variable-length patterns, for example if the next token would also be
|
|
||||||
| matched by the previous token. This problem should be resolved in a future
|
|
||||||
| release. For more information, see
|
|
||||||
| #[+a(gh("spaCy") + "/issues/864") this issue].
|
|
||||||
|
|
||||||
+table([ "OP", "Description", "Example"])
|
|
||||||
+row
|
+row
|
||||||
+cell #[code !]
|
+cell #[code !]
|
||||||
+cell match exactly 0 times
|
+cell Negate the pattern, by requiring it to match exactly 0 times.
|
||||||
+cell negation
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code *]
|
+cell #[code *]
|
||||||
+cell match 0 or more times
|
+cell Allow the pattern to match 0 or more times.
|
||||||
+cell optional, variable number
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code +]
|
+cell #[code +]
|
||||||
+cell match 1 or more times
|
+cell Require the pattern to match 1 or more times.
|
||||||
+cell mandatory, variable number
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code ?]
|
+cell #[code ?]
|
||||||
+cell match 0 or 1 times
|
+cell Make the pattern optional, by allowing it to match 0 or 1 times.
|
||||||
+cell optional, max one
|
|
||||||
|
p
|
||||||
|
| The #[code +] and #[code *] operators are usually interpreted
|
||||||
|
| "greedily", i.e. longer matches are returned where possible. However, if
|
||||||
|
| you specify two #[code +] and #[code *] patterns in a row and their
|
||||||
|
| matches overlap, the first operator will behave non-greedily. This quirk
|
||||||
|
| in the semantics makes the matcher more efficient, by avoiding the need
|
||||||
|
| for back-tracking.
|
||||||
|
|
||||||
+h(3, "adding-phrase-patterns") Adding phrase patterns
|
+h(3, "adding-phrase-patterns") Adding phrase patterns
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user