Merge pull request #1876 from GregDubbin/master

Pattern matcher fixes
This commit is contained in:
Matthew Honnibal 2018-01-24 16:38:11 +01:00 committed by GitHub
commit 6a8cb905aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 329 additions and 33 deletions

View File

@ -8,9 +8,15 @@ from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t
# try:
# from libcpp.unordered_map cimport unordered_map as umap
# except:
# from libcpp.map cimport map as umap
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .structs cimport TokenC
@ -62,10 +68,11 @@ cdef enum action_t:
REPEAT
ACCEPT
ADVANCE_ZERO
ADVANCE_PLUS
ACCEPT_PREV
PANIC
# A "match expression" consists of one or more token patterns
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
# A state is an (int, pattern pointer) pair, where the int is the start
# position, and the pattern pointer shows where we're up to
@ -83,7 +90,25 @@ cdef struct TokenPatternC:
ctypedef TokenPatternC* TokenPatternC_ptr
ctypedef pair[int, TokenPatternC_ptr] StateC
# ctypedef pair[int, TokenPatternC_ptr] StateC
# Match Dictionary entry type
# Records the most recently accepted match for a single pattern, so that
# later candidate matches can be tested for overlap against it.
cdef struct MatchEntryC:
# Index of the first token of the recorded match.
int32_t start
# End of the recorded match (exclusive: one past the last matched token).
int32_t end
# Position of the recorded match inside the `matches` list; used to
# overwrite that list entry when a longer match replaces it.
int32_t offset
# A state instance represents the information that defines a
# partial match
# start: the index of the first token in the partial match
# pattern: a pointer to the current token pattern in the full
# pattern
# last_match: pointer to the MatchEntryC holding the last span
# matched by the same pattern; it is consulted to skip partial
# matches that start strictly inside that span
cdef struct StateC:
int32_t start
TokenPatternC_ptr pattern
MatchEntryC* last_match
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
@ -128,6 +153,9 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
if pattern.quantifier == ZERO:
return REJECT
elif lookahead.nr_attr == 0:
if pattern.quantifier == ZERO_PLUS:
return REPEAT
else:
return ACCEPT
elif pattern.quantifier in (ONE, ZERO_ONE):
return ADVANCE
@ -138,7 +166,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
if next_action is REJECT:
return REPEAT
else:
return ADVANCE_ZERO
return ADVANCE_PLUS
else:
return PANIC
@ -339,77 +367,223 @@ cdef class Matcher:
cdef int i, token_i
cdef const TokenC* token
cdef StateC state
cdef int j = 0
cdef int k
cdef bint overlap = False
cdef MatchEntryC* state_match
cdef MatchEntryC* last_matches = <MatchEntryC*>self.mem.alloc(self.patterns.size(),sizeof(MatchEntryC))
for i in range(self.patterns.size()):
last_matches[i].start = 0
last_matches[i].end = 0
last_matches[i].offset = 0
matches = []
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
#for state in partials:
j=0
while j < n_partials:
state = partials[j]
action = get_action(state.pattern, token)
j += 1
# Skip patterns that would overlap with an existing match
# Patterns overlap an existing match if they point to the
# same final state and start between the start and end
# of said match.
# Different patterns with the same label are allowed to
# overlap.
state_match = state.last_match
if (state.start > state_match.start
and state.start < state_match.end):
continue
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
state.second += 1
action = get_action(state.second, token)
state.pattern += 1
action = get_action(state.pattern, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
# ADVANCE_PLUS acts like REPEAT, but also pushes a partial that
# acts like an ADVANCE_ZERO
if action == ADVANCE_PLUS:
state.pattern += 1
partials.push_back(state)
n_partials += 1
state.pattern -= 1
action = REPEAT
if action == ADVANCE:
state.pattern += 1
# Check for partial matches that are at the same spec in the same pattern
# Keep the longer of the matches
# This ensures that there are never more than 2 partials for every spec
# in a pattern (one of which gets pruned in this step)
overlap=False
for i in range(q):
if state.pattern == partials[i].pattern and state.start < partials[i].start:
partials[i] = state
j = i
overlap = True
break
if overlap:
continue
overlap=False
for i in range(q):
if state.pattern == partials[i].pattern:
overlap = True
break
if overlap:
continue
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match
# more pattern.
partials[q] = state
q += 1
elif action == REJECT:
pass
elif action == ADVANCE:
partials[q] = state
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
start = state.start
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value
matches.append((ent_id, start, end))
ent_id = state.pattern[1].attrs[0].value
label = state.pattern[1].attrs[1].value
# Check that this match doesn't overlap with an earlier match.
# Only overwrite an earlier match if it is a substring of this
# match (i.e. it starts after this match starts).
state_match = state.last_match
if start >= state_match.end:
state_match.start = start
state_match.end = end
state_match.offset = len(matches)
matches.append((ent_id,start,end))
elif start <= state_match.start and end >= state_match.end:
if len(matches) == 0:
assert state_match.offset==0
state_match.offset = 0
matches.append((ent_id,start,end))
else:
i = state_match.offset
matches[i] = (ent_id,start,end)
state_match.start = start
state_match.end = end
else:
pass
partials.resize(q)
n_partials = q
# Check whether we open any new patterns on this token
i=0
for pattern in self.patterns:
# Skip patterns that would overlap with an existing match
# state_match = pattern.last_match
state_match = &last_matches[i]
i+=1
if (token_i > state_match.start
and token_i < state_match.end):
continue
action = get_action(pattern, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
while action in (ADVANCE_PLUS,ADVANCE_ZERO):
if action == ADVANCE_PLUS:
state.start = token_i
state.pattern = pattern
state.last_match = state_match
partials.push_back(state)
n_partials += 1
pattern += 1
action = get_action(pattern, token)
if action == ADVANCE:
pattern += 1
j=0
overlap = False
for j in range(q):
if pattern == partials[j].pattern:
overlap = True
break
if overlap:
continue
if action == REPEAT:
state.first = token_i
state.second = pattern
state.start = token_i
state.pattern = pattern
state.last_match = state_match
partials.push_back(state)
n_partials += 1
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
state.start = token_i
state.pattern = pattern
state.last_match = state_match
partials.push_back(state)
n_partials += 1
elif action in (ACCEPT, ACCEPT_PREV):
start = token_i
end = token_i+1 if action == ACCEPT else token_i
ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value
matches.append((ent_id, start, end))
if start >= state_match.end:
state_match.start = start
state_match.end = end
state_match.offset = len(matches)
matches.append((ent_id,start,end))
if start <= state_match.start and end >= state_match.end:
if len(matches) == 0:
state_match.offset = 0
matches.append((ent_id,start,end))
else:
j = state_match.offset
matches[j] = (ent_id,start,end)
state_match.start = start
state_match.end = end
else:
pass
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
state.second += 1
if state.second.nr_attr == 0:
start = state.first
while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
state.pattern += 1
if state.pattern.nr_attr == 0:
start = state.start
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
matches.append((ent_id, start, end))
ent_id = state.pattern.attrs[0].value
label = state.pattern.attrs[1].value
state_match = state.last_match
if start >= state_match.end:
state_match.start = start
state_match.end = end
state_match.offset = len(matches)
matches.append((ent_id,start,end))
if start <= state_match.start and end >= state_match.end:
j = state_match.offset
if len(matches) == 0:
state_match.offset = 0
matches.append((ent_id,start,end))
else:
matches[j] = (ent_id,start,end)
state_match.start = start
state_match.end = end
else:
pass
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:

View File

@ -13,8 +13,8 @@ from ...vocab import Vocab
('a b', 0, 2),
('a c', 0, 1),
('a b c', 0, 2),
('a b b c', 0, 2),
('a b b', 0, 2),
('a b b c', 0, 3),
('a b b', 0, 3),
]
)
def test_issue1450_matcher_end_zero_plus(string, start, end):

View File

@ -0,0 +1,63 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ...matcher import Matcher
import pytest
# Matcher token patterns, each followed by the regular expression that the
# tests below compare it against.
pattern1 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
]
re_pattern1 = 'AA*'

pattern2 = [
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'A', 'OP': '1'},
]
re_pattern2 = 'A*A'

pattern3 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '1'},
]
re_pattern3 = 'AA'

pattern4 = [
    {'ORTH': 'B', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]
re_pattern4 = 'BA*B'

pattern5 = [
    {'ORTH': 'B', 'OP': '*'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]
re_pattern5 = 'B*A*B'
@pytest.fixture
def text():
    """Return the raw character string the regular expressions are run on."""
    sample = "(ABBAAAAAB)."
    return sample
@pytest.fixture
def doc(en_tokenizer, text):
    """Tokenize ``text`` after inserting a space between every character."""
    spaced = ' '.join(text)
    return en_tokenizer(spaced)
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5),
])
def test_greedy_matching(doc, text, pattern, re_pattern):
    """
    Test that the greedy matching behavior of the * op
    is consistent with other re implementations
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    # zip() stops at the shorter sequence, so without this check a matcher
    # that returned too few (or no) matches would pass vacuously.
    assert len(matches) == len(re_matches)
    for match, re_match in zip(matches, re_matches):
        # Drop the leading match id; compare only the (start, end) span.
        assert match[1:] == re_match
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5),
])
def test_match_consuming(doc, text, pattern, re_pattern):
    """
    Check that calling the matcher consumes tokens on a match, i.e. that it
    yields as many matches as the non-overlapping regex scan of ``text``.
    """
    token_matcher = Matcher(doc.vocab)
    token_matcher.add(re_pattern, None, pattern)
    found = token_matcher(doc)
    expected_spans = [hit.span() for hit in re.finditer(re_pattern, text)]
    assert len(found) == len(expected_spans)

View File

@ -0,0 +1,63 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ..matcher import Matcher
import pytest
# Matcher token patterns, each followed by the regular expression that the
# tests below compare it against.
pattern1 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
]
re_pattern1 = 'AA*'

pattern2 = [
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'A', 'OP': '1'},
]
re_pattern2 = 'A*A'

pattern3 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '1'},
]
re_pattern3 = 'AA'

pattern4 = [
    {'ORTH': 'B', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]
re_pattern4 = 'BA*B'

pattern5 = [
    {'ORTH': 'B', 'OP': '*'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]
re_pattern5 = 'B*A*B'
@pytest.fixture
def text():
    """Return the raw character string the regular expressions are run on."""
    sample = "(ABBAAAAAB)."
    return sample
@pytest.fixture
def doc(en_tokenizer, text):
    """Tokenize ``text`` after inserting a space between every character."""
    spaced = ' '.join(text)
    return en_tokenizer(spaced)
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5),
])
def test_greedy_matching(doc, text, pattern, re_pattern):
    """
    Test that the greedy matching behavior of the * op
    is consistent with other re implementations
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    # zip() stops at the shorter sequence, so without this check a matcher
    # that returned too few (or no) matches would pass vacuously.
    assert len(matches) == len(re_matches)
    for match, re_match in zip(matches, re_matches):
        # Drop the leading match id; compare only the (start, end) span.
        assert match[1:] == re_match
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5),
])
def test_match_consuming(doc, text, pattern, re_pattern):
    """
    Check that calling the matcher consumes tokens on a match, i.e. that it
    yields as many matches as the non-overlapping regex scan of ``text``.
    """
    token_matcher = Matcher(doc.vocab)
    token_matcher.add(re_pattern, None, pattern)
    found = token_matcher(doc)
    expected_spans = [hit.span() for hit in re.finditer(re_pattern, text)]
    assert len(found) == len(expected_spans)

View File

@ -161,11 +161,7 @@ p
p
| The #[code +] and #[code *] operators are usually interpreted
| "greedily", i.e. longer matches are returned where possible. However, if
| you specify two #[code +] and #[code *] patterns in a row and their
| matches overlap, the first operator will behave non-greedily. This quirk
| in the semantics makes the matcher more efficient, by avoiding the need
| for back-tracking.
| "greedily", i.e. longer matches are returned where possible.
+h(3, "adding-phrase-patterns") Adding phrase patterns