From 8bea62f26ea8f69fbf97bc29933461217b5e3e8e Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 16 Jan 2018 13:21:43 -0500 Subject: [PATCH 1/9] Correct bugs for greedy matching and introduce ADVANCE_PLUS action --- spacy/matcher.pyx | 169 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 157 insertions(+), 12 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index a6b02ba2c..6d40045ae 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -62,10 +62,10 @@ cdef enum action_t: REPEAT ACCEPT ADVANCE_ZERO + ADVANCE_PLUS ACCEPT_PREV PANIC -# A "match expression" conists of one or more token patterns # Each token pattern consists of a quantifier and 0+ (attr, value) pairs. # A state is an (int, pattern pointer) pair, where the int is the start # position, and the pattern pointer shows where we're up to @@ -128,7 +128,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: if pattern.quantifier == ZERO: return REJECT elif lookahead.nr_attr == 0: - return ACCEPT + if pattern.quantifier == ZERO_PLUS: + return REPEAT + else: + return ACCEPT elif pattern.quantifier in (ONE, ZERO_ONE): return ADVANCE elif pattern.quantifier == ZERO_PLUS: @@ -138,7 +141,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: if next_action is REJECT: return REPEAT else: - return ADVANCE_ZERO + return ADVANCE_PLUS else: return PANIC @@ -330,14 +333,26 @@ cdef class Matcher: cdef int i, token_i cdef const TokenC* token cdef StateC state + cdef int j = 0 + cdef int k + cdef bint add_match,overlap = False matches = [] + matches_dict = {} for token_i in range(doc.length): token = &doc.c[token_i] q = 0 # Go over the open matches, extending or finalizing if able. # Otherwise, we over-write them (q doesn't advance) - for state in partials: + #for state in partials: + j=0 + while j < n_partials: + state = partials[j] action = get_action(state.second, token) + j += 1 + # Skip patterns that would overlap with an existing match + ent_id = get_pattern_key(state.second) + if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.first= matches_dict[ent_id][1]: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: + i = matches_dict[ent_id][2] + matches[i] = (ent_id,start,end) + matches_dict[ent_id] = (start,end,i) + else: + pass partials.resize(q) + n_partials = q # Check whether we open any new patterns on this token for pattern in self.patterns: + # Skip patterns that would overlap with an existing match + ent_id = get_pattern_key(pattern) + if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_i= matches_dict[ent_id][1]: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: + j = matches_dict[ent_id][2] + matches[j] = (ent_id,start,end) + matches_dict[ent_id] = (start,end,j) + else: + pass + # Look for open patterns that are actually satisfied for state in partials: while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS): @@ -399,8 +531,21 @@ cdef class Matcher: start = state.first end = len(doc) ent_id = state.second.attrs[0].value - label = state.second.attrs[0].value - matches.append((ent_id, start, end)) + # ent_id = get_pattern_key(state.second) + label = state.second.attrs[1].value + # matches.append((ent_id, start, end)) + if ent_id not in matches_dict: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start >= matches_dict[ent_id][1]: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: + j = matches_dict[ent_id][2] + matches[j] = (ent_id,start,end) + matches_dict[ent_id] = (start,end,j) + else: + pass for i, (ent_id, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: From 7072b395c9e0705338c857e7b1ef7087cdb7b928 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 16 Jan 2018 15:46:13 -0500 Subject: [PATCH 2/9] Add greedy matcher tests --- spacy/tests/regression/test_issue1855.py | 63 ++++++++++++++++++++++++ spacy/tests/test_matcher_greedy.py | 63 ++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 spacy/tests/regression/test_issue1855.py create mode 100644 spacy/tests/test_matcher_greedy.py diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py new file mode 100644 index 000000000..882c356ca --- /dev/null +++ b/spacy/tests/regression/test_issue1855.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from ..matcher import Matcher + +import pytest + +pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}] +pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}] +pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}] +pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] +pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] + +re_pattern1 = 'AA*' +re_pattern2 = 'A*A' +re_pattern3 = 'AA' +re_pattern4 = 'BA*B' +re_pattern5 = 'B*A*B' + +@pytest.fixture +def text(): + return "(ABBAAAAAB)." + +@pytest.fixture +def doc(en_tokenizer,text): + doc = en_tokenizer(' '.join(text)) + return doc + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_greedy_matching(doc,text,pattern,re_pattern): + """ + Test that the greedy matching behavior of the * op + is consistant with other re implementations + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + for match,re_match in zip(matches,re_matches): + assert match[1:]==re_match + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_match_consuming(doc,text,pattern,re_pattern): + """ + Test that matcher.__call__ consumes tokens on a match + similar to re.findall + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + assert len(matches)==len(re_matches) \ No newline at end of file diff --git a/spacy/tests/test_matcher_greedy.py b/spacy/tests/test_matcher_greedy.py new file mode 100644 index 000000000..882c356ca --- /dev/null +++ b/spacy/tests/test_matcher_greedy.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from ..matcher import Matcher + +import pytest + +pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}] +pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}] +pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}] +pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] +pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}] + +re_pattern1 = 'AA*' +re_pattern2 = 'A*A' +re_pattern3 = 'AA' +re_pattern4 = 'BA*B' +re_pattern5 = 'B*A*B' + +@pytest.fixture +def text(): + return "(ABBAAAAAB)." + +@pytest.fixture +def doc(en_tokenizer,text): + doc = en_tokenizer(' '.join(text)) + return doc + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_greedy_matching(doc,text,pattern,re_pattern): + """ + Test that the greedy matching behavior of the * op + is consistant with other re implementations + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + for match,re_match in zip(matches,re_matches): + assert match[1:]==re_match + +@pytest.mark.parametrize('pattern,re_pattern',[ + (pattern1,re_pattern1), + (pattern2,re_pattern2), + (pattern3,re_pattern3), + (pattern4,re_pattern4), + (pattern5,re_pattern5)]) +def test_match_consuming(doc,text,pattern,re_pattern): + """ + Test that matcher.__call__ consumes tokens on a match + similar to re.findall + """ + matcher = Matcher(doc.vocab) + matcher.add(re_pattern,None,pattern) + matches = matcher(doc) + re_matches = [m.span() for m in re.finditer(re_pattern,text)] + assert len(matches)==len(re_matches) \ No newline at end of file From 490bc82c27d411fbaa8d49b0d581114e2ee88460 Mon Sep 17 00:00:00 2001 From: greg Date: Mon, 22 Jan 2018 10:03:12 -0500 Subject: [PATCH 3/9] Add comments clarifying matcher logic for '*' --- spacy/matcher.pyx | 47 ++++++++--------------------------------------- 1 file changed, 8 insertions(+), 39 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 6d40045ae..738cd8f5d 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -361,7 +361,8 @@ cdef class Matcher: if action == PANIC: raise Exception("Error selecting action in matcher") - + # ADVANCE_PLUS acts like REPEAT, but also pushes a partial that + # acts like and ADVANCE_ZERO if action == ADVANCE_PLUS: state.second += 1 partials.push_back(state) @@ -372,10 +373,13 @@ cdef class Matcher: if action == ADVANCE: state.second += 1 + # Check for partial matches that are at the same spec in the same pattern + # Keep the longer of the matches + # This ensures that there are never more then 2 partials for every spec + # in a pattern (one of which gets pruned in this step) + overlap=False for i in range(q): - if ent_id != get_pattern_key(partials[i].second): - continue if state.second == partials[i].second and state.first < partials[i].first: partials[i] = state j = i @@ -385,26 +389,12 @@ cdef class Matcher: continue overlap=False for i in range(q): - if ent_id != get_pattern_key(partials[i].second): - continue if state.second == partials[i].second: overlap = True break if overlap: continue - # overlap=False - # for i in range(q): - # if state.second == partials[i].second: - # if state.first < partials[i].first: - # partials[i] = state - # j = i-1 - # else: - # overlap=True - # break - # if overlap: - # continue - if action == REPEAT: # Leave the state in the queue, and advance to next slot @@ -425,10 +415,9 @@ cdef class Matcher: # ent_id = state.second[1].attrs[0].value # ent_id = get_pattern_key(state.second) label = state.second[1].attrs[1].value - # matches.append((ent_id, start, end)) # Check that this match doesn't overlap with an earlier match. # Only overwrite an earlier match if it is a substring of this - # match. + # match (i.e. it starts after this match starts). if ent_id not in matches_dict: matches_dict[ent_id] = (start,end,len(matches)) @@ -454,23 +443,8 @@ cdef class Matcher: action = get_action(pattern, token) if action == PANIC: raise Exception("Error selecting action in matcher") - # while acton == ADVANCE_ZERO: - # pattern += 1 - # action = get_action(pattern,token) - # if action == PANIC: - # raise Exception("Error selecting action in matcher") while action in (ADVANCE_PLUS,ADVANCE_ZERO): if action == ADVANCE_PLUS: - # j=0 - # overlap = False - # for j in range(q): - # if pattern == partials[j].second: - # overlap = True - # break - # if overlap: - # pattern += 1 - # action = get_action(pattern, token) - # continue state.first = token_i state.second = pattern partials.push_back(state) @@ -483,8 +457,6 @@ cdef class Matcher: j=0 overlap = False for j in range(q): - if ent_id == get_pattern_key(partials[j].second): - continue if pattern == partials[j].second: overlap = True break @@ -508,7 +480,6 @@ cdef class Matcher: start = token_i end = token_i+1 if action == ACCEPT else token_i ent_id = pattern[1].attrs[0].value - # ent_id = get_pattern_key(state.second) label = pattern[1].attrs[1].value if ent_id not in matches_dict: matches_dict[ent_id] = (start,end,len(matches)) @@ -531,9 +502,7 @@ cdef class Matcher: start = state.first end = len(doc) ent_id = state.second.attrs[0].value - # ent_id = get_pattern_key(state.second) label = state.second.attrs[1].value - # matches.append((ent_id, start, end)) if ent_id not in matches_dict: matches_dict[ent_id] = (start,end,len(matches)) matches.append((ent_id,start,end)) From d55992bdf085611f55cb6b7fb61a65b0bcfab31d Mon Sep 17 00:00:00 2001 From: greg Date: Mon, 22 Jan 2018 15:36:47 -0500 Subject: [PATCH 4/9] Switch match dictionary to use final state pointer rather than ID --- spacy/matcher.pyx | 114 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 26 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 738cd8f5d..dd8e0b55c 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -8,9 +8,13 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from libcpp.vector cimport vector from libcpp.pair cimport pair +from libcpp.unordered_map cimport unordered_map as umap +from cython.operator cimport dereference as deref from murmurhash.mrmr cimport hash64 from libc.stdint cimport int32_t +from libc.stdio cimport printf + from .typedefs cimport attr_t from .typedefs cimport hash_t from .structs cimport TokenC @@ -85,6 +89,11 @@ cdef struct TokenPatternC: ctypedef TokenPatternC* TokenPatternC_ptr ctypedef pair[int, TokenPatternC_ptr] StateC +# Match Dictionary entry type +cdef struct MatchEntryC: + int32_t start + int32_t end + int32_t offset cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL: @@ -336,8 +345,11 @@ cdef class Matcher: cdef int j = 0 cdef int k cdef bint add_match,overlap = False + cdef TokenPatternC_ptr final_state + cdef umap[TokenPatternC_ptr,MatchEntryC] matches_dict + cdef umap[TokenPatternC_ptr,MatchEntryC].iterator state_match + cdef MatchEntryC new_match matches = [] - matches_dict = {} for token_i in range(doc.length): token = &doc.c[token_i] q = 0 @@ -350,8 +362,18 @@ cdef class Matcher: action = get_action(state.second, token) j += 1 # Skip patterns that would overlap with an existing match - ent_id = get_pattern_key(state.second) - if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.firstderef(state_match).second.start + and state.first= matches_dict[ent_id][1]: - matches_dict[ent_id] = (start,end,len(matches)) + elif start >= deref(state_match).second.end: + new_match.start = start + new_match.end = end + new_match.offset = len(matches) + matches_dict[final_state] = new_match matches.append((ent_id,start,end)) - elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: - i = matches_dict[ent_id][2] + elif start <= deref(state_match).second.start and end>=deref(state_match).second.end: + i = deref(state_match).second.offset matches[i] = (ent_id,start,end) - matches_dict[ent_id] = (start,end,i) + new_match.start = start + new_match.end = end + new_match.offset = i + matches_dict[final_state] = new_match else: pass @@ -438,7 +471,13 @@ cdef class Matcher: for pattern in self.patterns: # Skip patterns that would overlap with an existing match ent_id = get_pattern_key(pattern) - if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_ideref(state_match).second.start + and token_i= matches_dict[ent_id][1]: - matches_dict[ent_id] = (start,end,len(matches)) + elif start >= deref(state_match).second.end: + new_match.start = start + new_match.end = end + new_match.offset = len(matches) + matches_dict[final_state] = new_match matches.append((ent_id,start,end)) - elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: - j = matches_dict[ent_id][2] + elif start <= deref(state_match).second.start and end>=deref(state_match).second.end: + j = deref(state_match).second.offset matches[j] = (ent_id,start,end) - matches_dict[ent_id] = (start,end,j) + new_match.start = start + new_match.end = end + new_match.offset = j + matches_dict[final_state] = new_match else: pass @@ -503,16 +554,27 @@ cdef class Matcher: end = len(doc) ent_id = state.second.attrs[0].value label = state.second.attrs[1].value - if ent_id not in matches_dict: - matches_dict[ent_id] = (start,end,len(matches)) + final_state = state.second + state_match = matches_dict.find(final_state) + if state_match == matches_dict.end(): + new_match.start = start + new_match.end = end + new_match.offset = len(matches) + matches_dict[final_state] = new_match matches.append((ent_id,start,end)) - elif start >= matches_dict[ent_id][1]: - matches_dict[ent_id] = (start,end,len(matches)) + elif start >= deref(state_match).second.end: + new_match.start = start + new_match.end = end + new_match.offset = len(matches) + matches_dict[final_state] = new_match matches.append((ent_id,start,end)) - elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: - j = matches_dict[ent_id][2] + elif start <= deref(state_match).second.start and end>=deref(state_match).second.end: + j = deref(state_match).second.offset matches[j] = (ent_id,start,end) - matches_dict[ent_id] = (start,end,j) + new_match.start = start + new_match.end = end + new_match.offset = j + matches_dict[final_state] = new_match else: pass for i, (ent_id, start, end) in enumerate(matches): From daefed0a342d5cac87544070ab762dd9349a1de5 Mon Sep 17 00:00:00 2001 From: greg Date: Mon, 22 Jan 2018 15:55:44 -0500 Subject: [PATCH 5/9] Correct documentation of '+' and '*' ops --- .../usage/_linguistic-features/_rule-based-matching.jade | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/website/usage/_linguistic-features/_rule-based-matching.jade b/website/usage/_linguistic-features/_rule-based-matching.jade index 82d48e438..794b6595c 100644 --- a/website/usage/_linguistic-features/_rule-based-matching.jade +++ b/website/usage/_linguistic-features/_rule-based-matching.jade @@ -161,11 +161,7 @@ p p | The #[code +] and #[code *] operators are usually interpretted - | "greedily", i.e. longer matches are returned where possible. However, if - | you specify two #[code +] and #[code *] patterns in a row and their - | matches overlap, the first operator will behave non-greedily. This quirk - | in the semantics makes the matcher more efficient, by avoiding the need - | for back-tracking. + | "greedily", i.e. longer matches are returned where possible. +h(3, "adding-phrase-patterns") Adding phrase patterns From 3a491093eed839c298e9aefdc3d491b86f54c436 Mon Sep 17 00:00:00 2001 From: greg Date: Mon, 22 Jan 2018 16:46:25 -0500 Subject: [PATCH 6/9] Import libcpp.map if libcpp.unordered_map doesn't exist --- spacy/matcher.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index dd8e0b55c..061936105 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -8,12 +8,14 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from libcpp.vector cimport vector from libcpp.pair cimport pair -from libcpp.unordered_map cimport unordered_map as umap from cython.operator cimport dereference as deref from murmurhash.mrmr cimport hash64 from libc.stdint cimport int32_t -from libc.stdio cimport printf +try: + from libcpp.unordered_map cimport unordered_map as umap +except: + from libcpp.map cimport map as umap from .typedefs cimport attr_t from .typedefs cimport hash_t From 686735b94e0a79c4111e5783ec108bd1bdc71238 Mon Sep 17 00:00:00 2001 From: greg Date: Mon, 22 Jan 2018 16:53:05 -0500 Subject: [PATCH 7/9] Fix matcher import --- spacy/tests/regression/test_issue1855.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py index 882c356ca..aeaad9413 100644 --- a/spacy/tests/regression/test_issue1855.py +++ b/spacy/tests/regression/test_issue1855.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from ..matcher import Matcher +from ...matcher import Matcher import pytest From f50bb1aafc70bfd728dde79f741e8917dd9741d2 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 23 Jan 2018 14:40:03 -0500 Subject: [PATCH 8/9] Restructure StateC to eliminate dependency on unordered_map --- spacy/matcher.pyx | 216 +++++++++++++++++++++++----------------------- 1 file changed, 106 insertions(+), 110 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 061936105..d9804a922 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -12,10 +12,10 @@ from cython.operator cimport dereference as deref from murmurhash.mrmr cimport hash64 from libc.stdint cimport int32_t -try: - from libcpp.unordered_map cimport unordered_map as umap -except: - from libcpp.map cimport map as umap +# try: +# from libcpp.unordered_map cimport unordered_map as umap +# except: +# from libcpp.map cimport map as umap from .typedefs cimport attr_t from .typedefs cimport hash_t @@ -72,6 +72,7 @@ cdef enum action_t: ACCEPT_PREV PANIC + # Each token pattern consists of a quantifier and 0+ (attr, value) pairs. # A state is an (int, pattern pointer) pair, where the int is the start # position, and the pattern pointer shows where we're up to @@ -89,7 +90,7 @@ cdef struct TokenPatternC: ctypedef TokenPatternC* TokenPatternC_ptr -ctypedef pair[int, TokenPatternC_ptr] StateC +# ctypedef pair[int, TokenPatternC_ptr] StateC # Match Dictionary entry type cdef struct MatchEntryC: @@ -97,6 +98,19 @@ cdef struct MatchEntryC: int32_t end int32_t offset +# A state instance represents the information that defines a +# partial match +# start: the index of the first token in the partial match +# pattern: a pointer to the current token pattern in the full +# pattern +# last_match: The entry of the last span matched by the +# same pattern +cdef struct StateC: + int32_t start + TokenPatternC_ptr pattern + MatchEntryC* last_match + + cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL: pattern = mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) @@ -346,11 +360,15 @@ cdef class Matcher: cdef StateC state cdef int j = 0 cdef int k - cdef bint add_match,overlap = False - cdef TokenPatternC_ptr final_state - cdef umap[TokenPatternC_ptr,MatchEntryC] matches_dict - cdef umap[TokenPatternC_ptr,MatchEntryC].iterator state_match - cdef MatchEntryC new_match + cdef bint overlap = False + cdef MatchEntryC* state_match + cdef MatchEntryC* last_matches = self.mem.alloc(self.patterns.size(),sizeof(MatchEntryC)) + + for i in range(self.patterns.size()): + last_matches[i].start = 0 + last_matches[i].end = 0 + last_matches[i].offset = 0 + matches = [] for token_i in range(doc.length): token = &doc.c[token_i] @@ -361,7 +379,7 @@ cdef class Matcher: j=0 while j < n_partials: state = partials[j] - action = get_action(state.second, token) + action = get_action(state.pattern, token) j += 1 # Skip patterns that would overlap with an existing match # Patterns overlap an existing match if they point to the @@ -369,33 +387,29 @@ cdef class Matcher: # of said match. # Different patterns with the same label are allowed to # overlap. - final_state = state.second - while final_state.nr_attr != 0: - final_state+=1 - state_match = matches_dict.find(final_state) - if (state_match != matches_dict.end() - and state.first>deref(state_match).second.start - and state.first state_match.start + and state.start < state_match.end): continue if action == PANIC: raise Exception("Error selecting action in matcher") while action == ADVANCE_ZERO: - state.second += 1 - action = get_action(state.second, token) + state.pattern += 1 + action = get_action(state.pattern, token) if action == PANIC: raise Exception("Error selecting action in matcher") # ADVANCE_PLUS acts like REPEAT, but also pushes a partial that # acts like and ADVANCE_ZERO if action == ADVANCE_PLUS: - state.second += 1 + state.pattern += 1 partials.push_back(state) n_partials += 1 - state.second -= 1 + state.pattern -= 1 action = REPEAT if action == ADVANCE: - state.second += 1 + state.pattern += 1 # Check for partial matches that are at the same spec in the same pattern # Keep the longer of the matches @@ -404,7 +418,7 @@ cdef class Matcher: overlap=False for i in range(q): - if state.second == partials[i].second and state.first < partials[i].first: + if state.pattern == partials[i].pattern and state.start < partials[i].start: partials[i] = state j = i overlap = True @@ -413,7 +427,7 @@ cdef class Matcher: continue overlap=False for i in range(q): - if state.second == partials[i].second: + if state.pattern == partials[i].pattern: overlap = True break if overlap: @@ -434,60 +448,53 @@ cdef class Matcher: elif action in (ACCEPT, ACCEPT_PREV): # TODO: What to do about patterns starting with ZERO? Need # to adjust the start position. - start = state.first + start = state.start end = token_i+1 if action == ACCEPT else token_i - ent_id = state.second[1].attrs[0].value - # ent_id = get_pattern_key(state.second) - label = state.second[1].attrs[1].value + ent_id = state.pattern[1].attrs[0].value + label = state.pattern[1].attrs[1].value # Check that this match doesn't overlap with an earlier match. # Only overwrite an earlier match if it is a substring of this # match (i.e. it starts after this match starts). - final_state = state.second+1 - state_match = matches_dict.find(final_state) + state_match = state.last_match - if state_match == matches_dict.end(): - new_match.start = start - new_match.end = end - new_match.offset = len(matches) - matches_dict[final_state] = new_match + if start >= state_match.end: + state_match.start = start + state_match.end = end + state_match.offset = len(matches) matches.append((ent_id,start,end)) - elif start >= deref(state_match).second.end: - new_match.start = start - new_match.end = end - new_match.offset = len(matches) - matches_dict[final_state] = new_match - matches.append((ent_id,start,end)) - elif start <= deref(state_match).second.start and end>=deref(state_match).second.end: - i = deref(state_match).second.offset - matches[i] = (ent_id,start,end) - new_match.start = start - new_match.end = end - new_match.offset = i - matches_dict[final_state] = new_match + elif start <= state_match.start and end >= state_match.end: + if len(matches) == 0: + assert state_match.offset==0 + state_match.offset = 0 + matches.append((ent_id,start,end)) + else: + i = state_match.offset + matches[i] = (ent_id,start,end) + state_match.start = start + state_match.end = end else: pass partials.resize(q) n_partials = q # Check whether we open any new patterns on this token + i=0 for pattern in self.patterns: # Skip patterns that would overlap with an existing match - ent_id = get_pattern_key(pattern) - final_state = pattern - while final_state.nr_attr != 0: - final_state+=1 - state_match = matches_dict.find(final_state) - if (state_match != matches_dict.end() - and token_i>deref(state_match).second.start - and token_i state_match.start + and token_i < state_match.end): continue action = get_action(pattern, token) if action == PANIC: raise Exception("Error selecting action in matcher") while action in (ADVANCE_PLUS,ADVANCE_ZERO): if action == ADVANCE_PLUS: - state.first = token_i - state.second = pattern + state.start = token_i + state.pattern = pattern + state.last_match = state_match partials.push_back(state) n_partials += 1 pattern += 1 @@ -498,7 +505,7 @@ cdef class Matcher: j=0 overlap = False for j in range(q): - if pattern == partials[j].second: + if pattern == partials[j].pattern: overlap = True break if overlap: @@ -506,15 +513,17 @@ cdef class Matcher: if action == REPEAT: - state.first = token_i - state.second = pattern + state.start = token_i + state.pattern = pattern + state.last_match = state_match partials.push_back(state) n_partials += 1 elif action == ADVANCE: # TODO: What to do about patterns starting with ZERO? Need # to adjust the start position. - state.first = token_i - state.second = pattern + state.start = token_i + state.pattern = pattern + state.last_match = state_match partials.push_back(state) n_partials += 1 elif action in (ACCEPT, ACCEPT_PREV): @@ -523,60 +532,47 @@ cdef class Matcher: ent_id = pattern[1].attrs[0].value label = pattern[1].attrs[1].value - final_state = pattern+1 - state_match = matches_dict.find(final_state) - if state_match == matches_dict.end(): - new_match.start = start - new_match.end = end - new_match.offset = len(matches) - matches_dict[final_state] = new_match + if start >= state_match.end: + state_match.start = start + state_match.end = end + state_match.offset = len(matches) matches.append((ent_id,start,end)) - elif start >= deref(state_match).second.end: - new_match.start = start - new_match.end = end - new_match.offset = len(matches) - matches_dict[final_state] = new_match - matches.append((ent_id,start,end)) - elif start <= deref(state_match).second.start and end>=deref(state_match).second.end: - j = deref(state_match).second.offset - matches[j] = (ent_id,start,end) - new_match.start = start - new_match.end = end - new_match.offset = j - matches_dict[final_state] = new_match + if start <= state_match.start and end >= state_match.end: + if len(matches) == 0: + state_match.offset = 0 + matches.append((ent_id,start,end)) + else: + j = state_match.offset + matches[j] = (ent_id,start,end) + state_match.start = start + state_match.end = end else: pass # Look for open patterns that are actually satisfied for state in partials: - while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS): - state.second += 1 - if state.second.nr_attr == 0: - start = state.first + while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS): + state.pattern += 1 + if state.pattern.nr_attr == 0: + start = state.start end = len(doc) - ent_id = state.second.attrs[0].value - label = state.second.attrs[1].value - final_state = state.second - state_match = matches_dict.find(final_state) - if state_match == matches_dict.end(): - new_match.start = start - new_match.end = end - new_match.offset = len(matches) - matches_dict[final_state] = new_match + ent_id = state.pattern.attrs[0].value + label = state.pattern.attrs[1].value + state_match = state.last_match + if start >= state_match.end: + state_match.start = start + state_match.end = end + state_match.offset = len(matches) matches.append((ent_id,start,end)) - elif start >= deref(state_match).second.end: - new_match.start = start - new_match.end = end - new_match.offset = len(matches) - matches_dict[final_state] = new_match - matches.append((ent_id,start,end)) - elif start <= deref(state_match).second.start and end>=deref(state_match).second.end: - j = deref(state_match).second.offset - matches[j] = (ent_id,start,end) - new_match.start = start - new_match.end = end - new_match.offset = j - matches_dict[final_state] = new_match + if start <= state_match.start and end >= state_match.end: + j = state_match.offset + if len(matches) == 0: + state_match.offset = 0 + matches.append((ent_id,start,end)) + else: + matches[j] = (ent_id,start,end) + state_match.start = start + state_match.end = end else: pass for i, (ent_id, start, end) in enumerate(matches): From 85ab99e6929b91ce0331267116b7142fc3be612f Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 23 Jan 2018 15:00:14 -0500 Subject: [PATCH 9/9] Correct test examples --- spacy/tests/regression/test_issue1450.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py index 6f1d4f568..3c8f975d9 100644 --- a/spacy/tests/regression/test_issue1450.py +++ b/spacy/tests/regression/test_issue1450.py @@ -13,8 +13,8 @@ from ...vocab import Vocab ('a b', 0, 2), ('a c', 0, 1), ('a b c', 0, 2), - ('a b b c', 0, 2), - ('a b b', 0, 2), + ('a b b c', 0, 3), + ('a b b', 0, 3), ] ) def test_issue1450_matcher_end_zero_plus(string, start, end):