mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Add '1' operator to matcher, and make sure open patterns are closed at the end of the document. Closes issue #766
This commit is contained in:
parent
f028f8ad28
commit
8f94897d07
|
@ -138,7 +138,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
def _convert_strings(token_specs, string_store):
|
def _convert_strings(token_specs, string_store):
|
||||||
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
||||||
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
||||||
'?': (ZERO_ONE,)}
|
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||||
tokens = []
|
tokens = []
|
||||||
op = ONE
|
op = ONE
|
||||||
for spec in token_specs:
|
for spec in token_specs:
|
||||||
|
@ -150,7 +150,7 @@ def _convert_strings(token_specs, string_store):
|
||||||
ops = operators[value]
|
ops = operators[value]
|
||||||
else:
|
else:
|
||||||
raise KeyError(
|
raise KeyError(
|
||||||
"Unknown operator. Options: %s" % ', '.join(operators.keys()))
|
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
|
||||||
if isinstance(attr, basestring):
|
if isinstance(attr, basestring):
|
||||||
attr = attrs.IDS.get(attr.upper())
|
attr = attrs.IDS.get(attr.upper())
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, basestring):
|
||||||
|
@ -418,6 +418,22 @@ cdef class Matcher:
|
||||||
match = acceptor(doc, ent_id, label, start, end)
|
match = acceptor(doc, ent_id, label, start, end)
|
||||||
if match:
|
if match:
|
||||||
matches.append(match)
|
matches.append(match)
|
||||||
|
# Look for open patterns that are actually satisfied
|
||||||
|
for state in partials:
|
||||||
|
while state.second.quantifier in (ZERO, ZERO_PLUS):
|
||||||
|
state.second += 1
|
||||||
|
if state.second.nr_attr == 0:
|
||||||
|
start = state.first
|
||||||
|
end = len(doc)
|
||||||
|
ent_id = state.second.attrs[0].value
|
||||||
|
label = state.second.attrs[0].value
|
||||||
|
acceptor = self._acceptors.get(ent_id)
|
||||||
|
if acceptor is None:
|
||||||
|
matches.append((ent_id, label, start, end))
|
||||||
|
else:
|
||||||
|
match = acceptor(doc, ent_id, label, start, end)
|
||||||
|
if match:
|
||||||
|
matches.append(match)
|
||||||
for i, (ent_id, label, start, end) in enumerate(matches):
|
for i, (ent_id, label, start, end) in enumerate(matches):
|
||||||
on_match = self._callbacks.get(ent_id)
|
on_match = self._callbacks.get(ent_id)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
|
|
|
@ -105,3 +105,21 @@ def test_matcher_match_zero_plus(matcher):
|
||||||
matcher.add('Quote', '', {}, [pattern])
|
matcher.add('Quote', '', {}, [pattern])
|
||||||
doc = get_doc(matcher.vocab, words)
|
doc = get_doc(matcher.vocab, words)
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
def test_matcher_match_one_plus(matcher):
    """Check a '1' token followed by a '+' token against a two-token doc.

    A control matcher holding a plain single-token pattern is run first on
    the same doc and is expected to report two matches; the quantified
    pattern added to the fixture matcher is expected to report one.
    """
    words = ['Philippe', 'Philippe']

    # Control: un-quantified single-token pattern on a separate matcher
    # that shares the fixture's vocab.
    control = Matcher(matcher.vocab)
    control.add_pattern('BasicPhilippe', [{'ORTH': 'Philippe'}], label=321)
    doc = get_doc(control.vocab, words)
    control_matches = control(doc)
    assert len(control_matches) == 2

    # Pattern under test: 'Philippe' with OP '1', then 'Philippe' with OP '+'.
    kleene_pattern = [
        {'ORTH': 'Philippe', 'OP': '1'},
        {'ORTH': 'Philippe', 'OP': '+'},
    ]
    matcher.add_pattern('KleenePhilippe', kleene_pattern, label=321)
    kleene_matches = matcher(doc)
    assert len(kleene_matches) == 1
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user