diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 501fc5e5d..ec87dce12 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -8,15 +8,9 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
-from cython.operator cimport dereference as deref
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-# try:
-#     from libcpp.unordered_map cimport unordered_map as umap
-# except:
-#     from libcpp.map cimport map as umap
-
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .structs cimport TokenC
@@ -68,11 +62,10 @@ cdef enum action_t:
     REPEAT
     ACCEPT
     ADVANCE_ZERO
-    ADVANCE_PLUS
    ACCEPT_PREV
     PANIC
 
-
+# A "match expression" consists of one or more token patterns
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
@@ -90,25 +83,7 @@ cdef struct TokenPatternC:
 
 ctypedef TokenPatternC* TokenPatternC_ptr
 
-# ctypedef pair[int, TokenPatternC_ptr] StateC
-
-# Match Dictionary entry type
-cdef struct MatchEntryC:
-    int32_t start
-    int32_t end
-    int32_t offset
-
-# A state instance represents the information that defines a
-# partial match
-# start: the index of the first token in the partial match
-# pattern: a pointer to the current token pattern in the full
-#     pattern
-# last_match: The entry of the last span matched by the
-#     same pattern
-cdef struct StateC:
-    int32_t start
-    TokenPatternC_ptr pattern
-    MatchEntryC* last_match
+ctypedef pair[int, TokenPatternC_ptr] StateC
 
 
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
@@ -153,10 +128,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if pattern.quantifier == ZERO:
             return REJECT
         elif lookahead.nr_attr == 0:
-            if pattern.quantifier == ZERO_PLUS:
-                return REPEAT
-            else:
-                return ACCEPT
+            return ACCEPT
     elif pattern.quantifier in (ONE, ZERO_ONE):
         return ADVANCE
     elif pattern.quantifier == ZERO_PLUS:
@@ -166,7 +138,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if next_action is REJECT:
             return REPEAT
         else:
-            return ADVANCE_PLUS
+            return ADVANCE_ZERO
     else:
         return PANIC
@@ -367,223 +339,77 @@ cdef class Matcher:
         cdef int i, token_i
         cdef const TokenC* token
         cdef StateC state
-        cdef int j = 0
-        cdef int k
-        cdef bint overlap = False
-        cdef MatchEntryC* state_match
-        cdef MatchEntryC* last_matches = self.mem.alloc(self.patterns.size(),sizeof(MatchEntryC))
-
-        for i in range(self.patterns.size()):
-            last_matches[i].start = 0
-            last_matches[i].end = 0
-            last_matches[i].offset = 0
-
         matches = []
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
             # Go over the open matches, extending or finalizing if able.
             # Otherwise, we over-write them (q doesn't advance)
-            #for state in partials:
-            j=0
-            while j < n_partials:
-                state = partials[j]
-                action = get_action(state.pattern, token)
-                j += 1
-                # Skip patterns that would overlap with an existing match
-                # Patterns overlap an existing match if they point to the
-                # same final state and start between the start and end
-                # of said match.
-                # Different patterns with the same label are allowed to
-                # overlap.
-                state_match = state.last_match
-                if (state.start > state_match.start
-                    and state.start < state_match.end):
-                    continue
+            for state in partials:
+                action = get_action(state.second, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 while action == ADVANCE_ZERO:
-                    state.pattern += 1
-                    action = get_action(state.pattern, token)
+                    state.second += 1
+                    action = get_action(state.second, token)
                     if action == PANIC:
                         raise Exception("Error selecting action in matcher")
-
-                # ADVANCE_PLUS acts like REPEAT, but also pushes a partial that
-                # acts like and ADVANCE_ZERO
-                if action == ADVANCE_PLUS:
-                    state.pattern += 1
-                    partials.push_back(state)
-                    n_partials += 1
-                    state.pattern -= 1
-                    action = REPEAT
-
-                if action == ADVANCE:
-                    state.pattern += 1
-
-                # Check for partial matches that are at the same spec in the same pattern
-                # Keep the longer of the matches
-                # This ensures that there are never more then 2 partials for every spec
-                # in a pattern (one of which gets pruned in this step)
-
-                overlap=False
-                for i in range(q):
-                    if state.pattern == partials[i].pattern and state.start < partials[i].start:
-                        partials[i] = state
-                        j = i
-                        overlap = True
-                        break
-                if overlap:
-                    continue
-                overlap=False
-                for i in range(q):
-                    if state.pattern == partials[i].pattern:
-                        overlap = True
-                        break
-                if overlap:
-                    continue
-
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
                     # (i.e. we don't overwrite -- we want to greedily match
                     # more pattern.
-                    partials[q] = state
                     q += 1
                 elif action == REJECT:
                     pass
                 elif action == ADVANCE:
                     partials[q] = state
+                    partials[q].second += 1
                     q += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    start = state.start
+                    start = state.first
                     end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.pattern[1].attrs[0].value
-                    label = state.pattern[1].attrs[1].value
-                    # Check that this match doesn't overlap with an earlier match.
-                    # Only overwrite an earlier match if it is a substring of this
-                    # match (i.e. it starts after this match starts).
-                    state_match = state.last_match
-
-                    if start >= state_match.end:
-                        state_match.start = start
-                        state_match.end = end
-                        state_match.offset = len(matches)
-                        matches.append((ent_id,start,end))
-                    elif start <= state_match.start and end >= state_match.end:
-                        if len(matches) == 0:
-                            assert state_match.offset==0
-                            state_match.offset = 0
-                            matches.append((ent_id,start,end))
-                        else:
-                            i = state_match.offset
-                            matches[i] = (ent_id,start,end)
-                            state_match.start = start
-                            state_match.end = end
-                    else:
-                        pass
+                    ent_id = state.second[1].attrs[0].value
+                    label = state.second[1].attrs[1].value
+                    matches.append((ent_id, start, end))
             partials.resize(q)
-            n_partials = q
             # Check whether we open any new patterns on this token
-            i=0
             for pattern in self.patterns:
-                # Skip patterns that would overlap with an existing match
-                # state_match = pattern.last_match
-                state_match = &last_matches[i]
-                i+=1
-                if (token_i > state_match.start
-                    and token_i < state_match.end):
-                    continue
                 action = get_action(pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
-                while action in (ADVANCE_PLUS,ADVANCE_ZERO):
-                    if action == ADVANCE_PLUS:
-                        state.start = token_i
-                        state.pattern = pattern
-                        state.last_match = state_match
-                        partials.push_back(state)
-                        n_partials += 1
+                while action == ADVANCE_ZERO:
                     pattern += 1
                     action = get_action(pattern, token)
-
-                if action == ADVANCE:
-                    pattern += 1
-                j=0
-                overlap = False
-                for j in range(q):
-                    if pattern == partials[j].pattern:
-                        overlap = True
-                        break
-                if overlap:
-                    continue
-
                 if action == REPEAT:
-                    state.start = token_i
-                    state.pattern = pattern
-                    state.last_match = state_match
+                    state.first = token_i
+                    state.second = pattern
                     partials.push_back(state)
-                    n_partials += 1
                 elif action == ADVANCE:
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    state.start = token_i
-                    state.pattern = pattern
-                    state.last_match = state_match
+                    state.first = token_i
+                    state.second = pattern + 1
                     partials.push_back(state)
-                    n_partials += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     start = token_i
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
-                    label = pattern[1].attrs[1].value
-                    if start >= state_match.end:
-                        state_match.start = start
-                        state_match.end = end
-                        state_match.offset = len(matches)
-                        matches.append((ent_id,start,end))
-                    if start <= state_match.start and end >= state_match.end:
-                        if len(matches) == 0:
-                            state_match.offset = 0
-                            matches.append((ent_id,start,end))
-                        else:
-                            j = state_match.offset
-                            matches[j] = (ent_id,start,end)
-                            state_match.start = start
-                            state_match.end = end
-                    else:
-                        pass
-
+                    matches.append((ent_id, start, end))
         # Look for open patterns that are actually satisfied
         for state in partials:
-            while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
-                state.pattern += 1
-            if state.pattern.nr_attr == 0:
-                start = state.start
+            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
+                state.second += 1
+            if state.second.nr_attr == 0:
+                start = state.first
                 end = len(doc)
-                ent_id = state.pattern.attrs[0].value
-                label = state.pattern.attrs[1].value
-                state_match = state.last_match
-                if start >= state_match.end:
-                    state_match.start = start
-                    state_match.end = end
-                    state_match.offset = len(matches)
-                    matches.append((ent_id,start,end))
-                if start <= state_match.start and end >= state_match.end:
-                    j = state_match.offset
-                    if len(matches) == 0:
-                        state_match.offset = 0
-                        matches.append((ent_id,start,end))
-                    else:
-                        matches[j] = (ent_id,start,end)
-                        state_match.start = start
-                        state_match.end = end
-                else:
-                    pass
+                ent_id = state.second.attrs[0].value
+                label = state.second.attrs[1].value
+                matches.append((ent_id, start, end))
         for i, (ent_id, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py
index 3c8f975d9..6f1d4f568 100644
--- a/spacy/tests/regression/test_issue1450.py
+++ b/spacy/tests/regression/test_issue1450.py
@@ -13,8 +13,8 @@ from ...vocab import Vocab
         ('a b', 0, 2),
         ('a c', 0, 1),
         ('a b c', 0, 2),
-        ('a b b c', 0, 3),
-        ('a b b', 0, 3),
+        ('a b b c', 0, 2),
+        ('a b b', 0, 2),
     ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py
deleted file mode 100644
index aeaad9413..000000000
--- a/spacy/tests/regression/test_issue1855.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-import re
-
-from ...matcher import Matcher
-
-import pytest
-
-pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
-pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
-pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
-pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
-pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
-
-re_pattern1 = 'AA*'
-re_pattern2 = 'A*A'
-re_pattern3 = 'AA'
-re_pattern4 = 'BA*B'
-re_pattern5 = 'B*A*B'
-
-@pytest.fixture
-def text():
-    return "(ABBAAAAAB)."
-
-@pytest.fixture
-def doc(en_tokenizer,text):
-    doc = en_tokenizer(' '.join(text))
-    return doc
-
-@pytest.mark.parametrize('pattern,re_pattern',[
-    (pattern1,re_pattern1),
-    (pattern2,re_pattern2),
-    (pattern3,re_pattern3),
-    (pattern4,re_pattern4),
-    (pattern5,re_pattern5)])
-def test_greedy_matching(doc,text,pattern,re_pattern):
-    """
-    Test that the greedy matching behavior of the * op
-    is consistant with other re implementations
-    """
-    matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern,None,pattern)
-    matches = matcher(doc)
-    re_matches = [m.span() for m in re.finditer(re_pattern,text)]
-    for match,re_match in zip(matches,re_matches):
-        assert match[1:]==re_match
-
-@pytest.mark.parametrize('pattern,re_pattern',[
-    (pattern1,re_pattern1),
-    (pattern2,re_pattern2),
-    (pattern3,re_pattern3),
-    (pattern4,re_pattern4),
-    (pattern5,re_pattern5)])
-def test_match_consuming(doc,text,pattern,re_pattern):
-    """
-    Test that matcher.__call__ consumes tokens on a match
-    similar to re.findall
-    """
-    matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern,None,pattern)
-    matches = matcher(doc)
-    re_matches = [m.span() for m in re.finditer(re_pattern,text)]
-    assert len(matches)==len(re_matches)
\ No newline at end of file
diff --git a/spacy/tests/test_matcher_greedy.py b/spacy/tests/test_matcher_greedy.py
deleted file mode 100644
index 882c356ca..000000000
--- a/spacy/tests/test_matcher_greedy.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-import re
-
-from ..matcher import Matcher
-
-import pytest
-
-pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
-pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
-pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
-pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
-pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
-
-re_pattern1 = 'AA*'
-re_pattern2 = 'A*A'
-re_pattern3 = 'AA'
-re_pattern4 = 'BA*B'
-re_pattern5 = 'B*A*B'
-
-@pytest.fixture
-def text():
-    return "(ABBAAAAAB)."
-
-@pytest.fixture
-def doc(en_tokenizer,text):
-    doc = en_tokenizer(' '.join(text))
-    return doc
-
-@pytest.mark.parametrize('pattern,re_pattern',[
-    (pattern1,re_pattern1),
-    (pattern2,re_pattern2),
-    (pattern3,re_pattern3),
-    (pattern4,re_pattern4),
-    (pattern5,re_pattern5)])
-def test_greedy_matching(doc,text,pattern,re_pattern):
-    """
-    Test that the greedy matching behavior of the * op
-    is consistant with other re implementations
-    """
-    matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern,None,pattern)
-    matches = matcher(doc)
-    re_matches = [m.span() for m in re.finditer(re_pattern,text)]
-    for match,re_match in zip(matches,re_matches):
-        assert match[1:]==re_match
-
-@pytest.mark.parametrize('pattern,re_pattern',[
-    (pattern1,re_pattern1),
-    (pattern2,re_pattern2),
-    (pattern3,re_pattern3),
-    (pattern4,re_pattern4),
-    (pattern5,re_pattern5)])
-def test_match_consuming(doc,text,pattern,re_pattern):
-    """
-    Test that matcher.__call__ consumes tokens on a match
-    similar to re.findall
-    """
-    matcher = Matcher(doc.vocab)
-    matcher.add(re_pattern,None,pattern)
-    matches = matcher(doc)
-    re_matches = [m.span() for m in re.finditer(re_pattern,text)]
-    assert len(matches)==len(re_matches)
\ No newline at end of file
diff --git a/website/usage/_linguistic-features/_rule-based-matching.jade b/website/usage/_linguistic-features/_rule-based-matching.jade
index e1a7c8a81..8e39746b4 100644
--- a/website/usage/_linguistic-features/_rule-based-matching.jade
+++ b/website/usage/_linguistic-features/_rule-based-matching.jade
@@ -179,8 +179,12 @@ p
         +cell Allow the pattern to match zero or more times.
 
 p
-    | The #[code +] and #[code *] operators are usually interpretted
-    | "greedily", i.e. longer matches are returned where possible.
+    | In versions before v2.1.0, the #[code +] and #[code *] operators
+    | behaved inconsistently. They were usually interpreted
+    | "greedily", i.e. longer matches were returned where possible. However,
+    | if you specified two #[code +] or #[code *] patterns in a row and their
+    | matches overlapped, the first operator would behave non-greedily. This
+    | quirk in the semantics is corrected in spaCy v2.1.0.
 
 +h(3, "adding-phrase-patterns") Adding phrase patterns
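
The docs change above describes the quirk only abstractly. As a concrete illustration, here is a minimal sketch using the v2.0 Matcher API seen in the deleted tests (this snippet is not part of the patch; 'QUIRK' is an arbitrary match key). The pattern corresponds to the regex 'A*A', for which re always returns a single greedy span:

    # Sketch of the documented quirk (assumes spaCy v2.0.x).
    from spacy.matcher import Matcher
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=['A', 'A', 'A'])

    matcher = Matcher(vocab)
    # Two quantified tokens in a row: 'A*' followed by 'A', i.e. the regex 'A*A'.
    matcher.add('QUIRK', None, [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}])

    # re.finditer('A*A', 'AAA') yields one greedy span, (0, 3). Before v2.1.0,
    # the leading '*' could match non-greedily when the two operators' matches
    # overlapped, so shorter spans could be returned instead.
    for match_id, start, end in matcher(doc):
        print(vocab.strings[match_id], start, end)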
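Similarly, the expectation change in test_issue1450.py earlier in this diff can be reproduced directly. The hunk does not show the pattern itself, so the pattern below is an assumption inferred from the test name (a pattern ending in a ZERO_PLUS token); the real one lives in the test file:

    # Hypothetical repro of the updated expectations for ('a b b', 0, 2).
    from spacy.matcher import Matcher
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words='a b b'.split())

    matcher = Matcher(vocab)
    # A pattern that ends with a '*' quantifier, i.e. "a b*" (assumed).
    matcher.add('TSTEND', None, [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}])
    matches = matcher(doc)

    # The greedy matcher extended the final match to (0, 3); after this revert
    # the test expects the final match for 'a b b' to end at token 2.
    print(matches[-1])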