Merge pull request #1876 from GregDubbin/master

Pattern matcher fixes
Matthew Honnibal 2018-01-24 16:38:11 +01:00 committed by GitHub
commit 6a8cb905aa
5 changed files with 329 additions and 33 deletions


@@ -8,9 +8,15 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
+from cython.operator cimport dereference as deref
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
+# try:
+#     from libcpp.unordered_map cimport unordered_map as umap
+# except:
+#     from libcpp.map cimport map as umap
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .structs cimport TokenC
@@ -62,10 +68,11 @@ cdef enum action_t:
     REPEAT
     ACCEPT
     ADVANCE_ZERO
+    ADVANCE_PLUS
     ACCEPT_PREV
     PANIC

+# A "match expression" consists of one or more token patterns.
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
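To make that data model concrete, here is a minimal pure-Python sketch (the names and tuple layout are illustrative only, not the actual Cython structs):

# Illustrative sketch of a "match expression": a list of token patterns,
# each pairing a quantifier with the (attr, value) constraints a token
# must satisfy. Quantifier names mirror the enum in matcher.pyx.
ONE, ZERO, ZERO_ONE, ZERO_PLUS = range(4)

pattern = [
    (ONE,       [('ORTH', 'B')]),   # exactly one token with ORTH == 'B'
    (ZERO_PLUS, [('ORTH', 'A')]),   # any number of 'A' tokens
    (ONE,       [('ORTH', 'B')]),   # one closing 'B'
]

# A state is an (int, pattern position) pair: where the partial match
# started, and how far into the pattern it has progressed.
state = (0, 1)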
@@ -83,7 +90,25 @@ cdef struct TokenPatternC:
 ctypedef TokenPatternC* TokenPatternC_ptr
-ctypedef pair[int, TokenPatternC_ptr] StateC
+# ctypedef pair[int, TokenPatternC_ptr] StateC

+# Match Dictionary entry type
+cdef struct MatchEntryC:
+    int32_t start
+    int32_t end
+    int32_t offset
+
+# A state instance represents the information that defines a partial match:
+#   start: the index of the first token in the partial match
+#   pattern: a pointer to the current token pattern in the full pattern
+#   last_match: the entry of the last span matched by the same pattern
+cdef struct StateC:
+    int32_t start
+    TokenPatternC_ptr pattern
+    MatchEntryC* last_match

 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
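Read as plain Python, the two new structs amount to the following (a sketch for orientation only; the real fields are C integers and raw pointers):

from dataclasses import dataclass

@dataclass
class MatchEntry:           # mirrors MatchEntryC
    start: int              # first token of the last span this pattern matched
    end: int                # one past the last token of that span
    offset: int             # index of that span in the output matches list

@dataclass
class State:                # mirrors StateC
    start: int              # index of the first token in the partial match
    pattern_pos: int        # stands in for the TokenPatternC pointer
    last_match: MatchEntry  # shared by all states spawned by the same pattern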
@@ -128,7 +153,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if pattern.quantifier == ZERO:
             return REJECT
         elif lookahead.nr_attr == 0:
-            return ACCEPT
+            if pattern.quantifier == ZERO_PLUS:
+                return REPEAT
+            else:
+                return ACCEPT
     elif pattern.quantifier in (ONE, ZERO_ONE):
         return ADVANCE
     elif pattern.quantifier == ZERO_PLUS:
@@ -138,7 +166,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if next_action is REJECT:
             return REPEAT
         else:
-            return ADVANCE_ZERO
+            return ADVANCE_PLUS
     else:
         return PANIC
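These two `get_action` changes implement the greediness fix: when the next pattern slot is the end of the pattern (`nr_attr == 0`), a `ZERO_PLUS` quantifier now returns `REPEAT` instead of `ACCEPT`, so a trailing `*` keeps consuming tokens; and the repeat-or-advance case returns the new `ADVANCE_PLUS`, which the caller expands into a `REPEAT` plus a forked state that has already advanced past the `*` slot. A sketch of the first change (string constants stand in for the enum):

REPEAT, ACCEPT = 'REPEAT', 'ACCEPT'

def action_when_next_slot_ends_pattern(quantifier):
    # Previously this always returned ACCEPT; now a trailing '*' keeps
    # trying to extend the match instead of stopping at the first chance.
    if quantifier == 'ZERO_PLUS':
        return REPEAT
    return ACCEPT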
@@ -339,77 +367,223 @@ cdef class Matcher:
         cdef int i, token_i
         cdef const TokenC* token
         cdef StateC state
+        cdef int j = 0
+        cdef int k
+        cdef bint overlap = False
+        cdef MatchEntryC* state_match
+        cdef MatchEntryC* last_matches = <MatchEntryC*>self.mem.alloc(self.patterns.size(), sizeof(MatchEntryC))
+        for i in range(self.patterns.size()):
+            last_matches[i].start = 0
+            last_matches[i].end = 0
+            last_matches[i].offset = 0
         matches = []
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
             # Go over the open matches, extending or finalizing if able.
             # Otherwise, we over-write them (q doesn't advance)
-            for state in partials:
-                action = get_action(state.second, token)
+            j = 0
+            while j < n_partials:
+                state = partials[j]
+                action = get_action(state.pattern, token)
+                j += 1
+                # Skip patterns that would overlap with an existing match.
+                # A pattern overlaps an existing match if it points to the
+                # same final state and starts between the start and end of
+                # said match. Different patterns with the same label are
+                # allowed to overlap.
+                state_match = state.last_match
+                if (state.start > state_match.start
+                        and state.start < state_match.end):
+                    continue
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 while action == ADVANCE_ZERO:
-                    state.second += 1
-                    action = get_action(state.second, token)
+                    state.pattern += 1
+                    action = get_action(state.pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
+                # ADVANCE_PLUS acts like REPEAT, but also pushes a partial
+                # that acts like an ADVANCE_ZERO
+                if action == ADVANCE_PLUS:
+                    state.pattern += 1
+                    partials.push_back(state)
+                    n_partials += 1
+                    state.pattern -= 1
+                    action = REPEAT
+                if action == ADVANCE:
+                    state.pattern += 1
+                # Check for partial matches that are at the same spec in the
+                # same pattern, and keep the longer of the two. This ensures
+                # that there are never more than 2 partials for every spec
+                # in a pattern (one of which gets pruned in this step).
+                overlap = False
+                for i in range(q):
+                    if state.pattern == partials[i].pattern and state.start < partials[i].start:
+                        partials[i] = state
+                        j = i
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+                overlap = False
+                for i in range(q):
+                    if state.pattern == partials[i].pattern:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
                     # (i.e. we don't overwrite -- we want to greedily match
                     # more pattern.)
+                    partials[q] = state
                     q += 1
                 elif action == REJECT:
                     pass
                 elif action == ADVANCE:
                     partials[q] = state
-                    partials[q].second += 1
                     q += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    start = state.first
+                    start = state.start
                     end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.second[1].attrs[0].value
-                    label = state.second[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    ent_id = state.pattern[1].attrs[0].value
+                    label = state.pattern[1].attrs[1].value
+                    # Check that this match doesn't overlap with an earlier
+                    # match. Only overwrite an earlier match if it is a
+                    # substring of this match (i.e. it starts after this
+                    # match starts).
+                    state_match = state.last_match
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
+                        matches.append((ent_id, start, end))
+                    elif start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            assert state_match.offset == 0
+                            state_match.offset = 0
+                            matches.append((ent_id, start, end))
+                        else:
+                            i = state_match.offset
+                            matches[i] = (ent_id, start, end)
+                        state_match.start = start
+                        state_match.end = end
+                    else:
+                        pass
             partials.resize(q)
+            n_partials = q
             # Check whether we open any new patterns on this token
+            i = 0
             for pattern in self.patterns:
+                # Skip patterns that would overlap with an existing match
+                state_match = &last_matches[i]
+                i += 1
+                if (token_i > state_match.start
+                        and token_i < state_match.end):
+                    continue
                 action = get_action(pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
-                while action == ADVANCE_ZERO:
+                while action in (ADVANCE_PLUS, ADVANCE_ZERO):
+                    if action == ADVANCE_PLUS:
+                        state.start = token_i
+                        state.pattern = pattern
+                        state.last_match = state_match
+                        partials.push_back(state)
+                        n_partials += 1
                     pattern += 1
                     action = get_action(pattern, token)
+                if action == ADVANCE:
+                    pattern += 1
+                overlap = False
+                for j in range(q):
+                    if pattern == partials[j].pattern:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
                 if action == REPEAT:
-                    state.first = token_i
-                    state.second = pattern
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
+                    n_partials += 1
                 elif action == ADVANCE:
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    state.first = token_i
-                    state.second = pattern + 1
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
+                    n_partials += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     start = token_i
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
                     label = pattern[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
+                        matches.append((ent_id, start, end))
+                    if start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            state_match.offset = 0
+                            matches.append((ent_id, start, end))
+                        else:
+                            j = state_match.offset
+                            matches[j] = (ent_id, start, end)
+                        state_match.start = start
+                        state_match.end = end
+                    else:
+                        pass
         # Look for open patterns that are actually satisfied
         for state in partials:
-            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
-                state.second += 1
-            if state.second.nr_attr == 0:
-                start = state.first
+            while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
+                state.pattern += 1
+            if state.pattern.nr_attr == 0:
+                start = state.start
                 end = len(doc)
-                ent_id = state.second.attrs[0].value
-                label = state.second.attrs[0].value
-                matches.append((ent_id, start, end))
+                ent_id = state.pattern.attrs[0].value
+                label = state.pattern.attrs[1].value
+                state_match = state.last_match
+                if start >= state_match.end:
+                    state_match.start = start
+                    state_match.end = end
+                    state_match.offset = len(matches)
+                    matches.append((ent_id, start, end))
+                if start <= state_match.start and end >= state_match.end:
+                    j = state_match.offset
+                    if len(matches) == 0:
+                        state_match.offset = 0
+                        matches.append((ent_id, start, end))
+                    else:
+                        matches[j] = (ent_id, start, end)
+                    state_match.start = start
+                    state_match.end = end
+                else:
+                    pass
         for i, (ent_id, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
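The net effect of the rewritten loop is easiest to see against `re`, which the new tests below use as a reference. Under greedy, consuming semantics, a pattern like `AA*` yields two non-overlapping spans over the test text rather than one candidate per start position (a sketch of the intended behavior, not the Cython code):

import re

text = "(ABBAAAAAB)."
print([m.span() for m in re.finditer("AA*", text)])
# [(1, 2), (4, 9)] -- '*' extends as far as it can, and a finished
# match consumes its tokens, so no overlapping span is reported.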


@@ -13,8 +13,8 @@ from ...vocab import Vocab
     ('a b', 0, 2),
     ('a c', 0, 1),
     ('a b c', 0, 2),
-    ('a b b c', 0, 2),
-    ('a b b', 0, 2),
+    ('a b b c', 0, 3),
+    ('a b b', 0, 3),
 ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
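The expected ends move from 2 to 3 because the trailing `*` is now greedy. Sketched in `re` form (the actual token pattern lives in the test file), `ab*` swallows both `b`s in `abb`:

import re

print(re.match("ab*", "abb").span())  # (0, 3) under greedy matching, not (0, 2)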


@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+import pytest
+
+from ...matcher import Matcher
+
+pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
+pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
+pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
+pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+
+re_pattern1 = 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3 = 'AA'
+re_pattern4 = 'BA*B'
+re_pattern5 = 'B*A*B'
+
+
+@pytest.fixture
+def text():
+    return "(ABBAAAAAB)."
+
+
+@pytest.fixture
+def doc(en_tokenizer, text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_greedy_matching(doc, text, pattern, re_pattern):
+    """Test that the greedy matching behavior of the * op
+    is consistent with other re implementations."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    for match, re_match in zip(matches, re_matches):
+        assert match[1:] == re_match
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_match_consuming(doc, text, pattern, re_pattern):
+    """Test that matcher.__call__ consumes tokens on a match
+    similar to re.findall."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    assert len(matches) == len(re_matches)


@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+import pytest
+
+from ..matcher import Matcher
+
+pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
+pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
+pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
+pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
+
+re_pattern1 = 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3 = 'AA'
+re_pattern4 = 'BA*B'
+re_pattern5 = 'B*A*B'
+
+
+@pytest.fixture
+def text():
+    return "(ABBAAAAAB)."
+
+
+@pytest.fixture
+def doc(en_tokenizer, text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_greedy_matching(doc, text, pattern, re_pattern):
+    """Test that the greedy matching behavior of the * op
+    is consistent with other re implementations."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    for match, re_match in zip(matches, re_matches):
+        assert match[1:] == re_match
+
+
+@pytest.mark.parametrize('pattern,re_pattern', [
+    (pattern1, re_pattern1),
+    (pattern2, re_pattern2),
+    (pattern3, re_pattern3),
+    (pattern4, re_pattern4),
+    (pattern5, re_pattern5)])
+def test_match_consuming(doc, text, pattern, re_pattern):
+    """Test that matcher.__call__ consumes tokens on a match
+    similar to re.findall."""
+    matcher = Matcher(doc.vocab)
+    matcher.add(re_pattern, None, pattern)
+    matches = matcher(doc)
+    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
+    assert len(matches) == len(re_matches)


@@ -161,11 +161,7 @@ p
 p
     | The #[code +] and #[code *] operators are usually interpreted
-    | "greedily", i.e. longer matches are returned where possible. However, if
-    | you specify two #[code +] and #[code *] patterns in a row and their
-    | matches overlap, the first operator will behave non-greedily. This quirk
-    | in the semantics makes the matcher more efficient, by avoiding the need
-    | for back-tracking.
+    | "greedily", i.e. longer matches are returned where possible.

 +h(3, "adding-phrase-patterns") Adding phrase patterns
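With the non-greedy caveat deleted, the documented behavior is simply that `+` and `*` return the longest match available. A minimal usage sketch under that reading (assumes the spaCy 2.x `Matcher.add(key, callback, pattern)` signature used in the tests above):

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=['B', 'A', 'A', 'B'])
matcher = Matcher(vocab)
matcher.add('BA*B', None,
            [{'ORTH': 'B'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B'}])
for match_id, start, end in matcher(doc):
    print(start, end)  # expect the single greedy span (0, 4)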