From 8bea62f26ea8f69fbf97bc29933461217b5e3e8e Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Tue, 16 Jan 2018 13:21:43 -0500
Subject: [PATCH 1/9] Correct bugs for greedy matching and introduce
 ADVANCE_PLUS action

---
 spacy/matcher.pyx | 169 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 157 insertions(+), 12 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index a6b02ba2c..6d40045ae 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -62,10 +62,10 @@ cdef enum action_t:
     REPEAT
     ACCEPT
     ADVANCE_ZERO
+    ADVANCE_PLUS
     ACCEPT_PREV
     PANIC
 
-# A "match expression" conists of one or more token patterns
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
@@ -128,7 +128,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
     if pattern.quantifier == ZERO:
         return REJECT
     elif lookahead.nr_attr == 0:
-        return ACCEPT
+        if pattern.quantifier == ZERO_PLUS:
+            return REPEAT
+        else:
+            return ACCEPT
     elif pattern.quantifier in (ONE, ZERO_ONE):
         return ADVANCE
     elif pattern.quantifier == ZERO_PLUS:
@@ -138,7 +141,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
         if next_action is REJECT:
             return REPEAT
         else:
-            return ADVANCE_ZERO
+            return ADVANCE_PLUS
     else:
         return PANIC
 
@@ -330,14 +333,26 @@ cdef class Matcher:
         cdef int i, token_i
         cdef const TokenC* token
         cdef StateC state
+        cdef int j = 0
+        cdef int k
+        cdef bint add_match,overlap = False
         matches = []
+        matches_dict = {}
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
             # Go over the open matches, extending or finalizing if able.
             # Otherwise, we over-write them (q doesn't advance)
-            for state in partials:
+            #for state in partials:
+            j=0
+            while j < n_partials:
+                state = partials[j]
                 action = get_action(state.second, token)
+                j += 1
+                # Skip patterns that would overlap with an existing match
+                ent_id = get_pattern_key(state.second)
+                if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.first<matches_dict[ent_id][1]:
+                    continue
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 while action == ADVANCE_ZERO:
@@ -345,52 +360,169 @@ cdef class Matcher:
                     action = get_action(state.second, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
+                
 
+                if action == ADVANCE_PLUS:
+                    state.second += 1
+                    partials.push_back(state)
+                    n_partials += 1
+                    state.second -= 1
+                    action = REPEAT
+
+                if action == ADVANCE:
+                    state.second += 1
+
+                overlap=False
+                for i in range(q):
+                    if ent_id != get_pattern_key(partials[i].second):
+                        continue
+                    if state.second == partials[i].second and state.first < partials[i].first:
+                        partials[i] = state
+                        j = i
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+                overlap=False
+                for i in range(q):
+                    if ent_id != get_pattern_key(partials[i].second):
+                        continue
+                    if state.second == partials[i].second:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+
+                # overlap=False
+                # for i in range(q):
+                #     if state.second == partials[i].second:
+                #         if state.first < partials[i].first:
+                #             partials[i] = state
+                #             j = i-1
+                #         else:
+                #             overlap=True
+                #         break
+                # if overlap:
+                #     continue
+
+    
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
                     # (i.e. we don't overwrite -- we want to greedily match
                     # more pattern.
+                    partials[q] = state
                     q += 1
                 elif action == REJECT:
                     pass
                 elif action == ADVANCE:
                     partials[q] = state
-                    partials[q].second += 1
                     q += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
                     start = state.first
                     end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.second[1].attrs[0].value
+                    # ent_id = state.second[1].attrs[0].value
+                    # ent_id = get_pattern_key(state.second)
                     label = state.second[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    # matches.append((ent_id, start, end))
+                    # Check that this match doesn't overlap with an earlier match.
+                    # Only overwrite an earlier match if it is a substring of this
+                    # match.
+
+                    if ent_id not in matches_dict:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start >= matches_dict[ent_id][1]:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
+                        i = matches_dict[ent_id][2]
+                        matches[i] = (ent_id,start,end)
+                        matches_dict[ent_id] = (start,end,i)
+                    else:
+                        pass
 
             partials.resize(q)
+            n_partials = q
             # Check whether we open any new patterns on this token
             for pattern in self.patterns:
+                # Skip patterns that would overlap with an existing match
+                ent_id = get_pattern_key(pattern)
+                if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_i<matches_dict[ent_id][1]:
+                    continue
                 action = get_action(pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
-                while action == ADVANCE_ZERO:
+                # while acton == ADVANCE_ZERO:
+                #     pattern += 1
+                #     action = get_action(pattern,token)
+                # if action == PANIC:
+                #     raise Exception("Error selecting action in matcher")
+                while action in (ADVANCE_PLUS,ADVANCE_ZERO):
+                    if action == ADVANCE_PLUS:
+                        # j=0
+                        # overlap = False
+                        # for j in range(q):
+                        #     if pattern == partials[j].second:
+                        #         overlap = True
+                        #         break
+                        # if overlap:
+                        #     pattern += 1
+                        #     action = get_action(pattern, token)
+                        #     continue
+                        state.first = token_i
+                        state.second = pattern
+                        partials.push_back(state)
+                        n_partials += 1
                     pattern += 1
                     action = get_action(pattern, token)
+
+                if action == ADVANCE:
+                    pattern += 1
+                j=0
+                overlap = False
+                for j in range(q):
+                    if ent_id == get_pattern_key(partials[j].second):
+                        continue
+                    if pattern == partials[j].second:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+
+
                 if action == REPEAT:
                     state.first = token_i
                     state.second = pattern
                     partials.push_back(state)
+                    n_partials += 1
                 elif action == ADVANCE:
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
                     state.first = token_i
-                    state.second = pattern + 1
+                    state.second = pattern
                     partials.push_back(state)
+                    n_partials += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
                     start = token_i
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
+                    # ent_id = get_pattern_key(state.second)
                     label = pattern[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    if ent_id not in matches_dict:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start >= matches_dict[ent_id][1]:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
+                        j = matches_dict[ent_id][2]
+                        matches[j] = (ent_id,start,end)
+                        matches_dict[ent_id] = (start,end,j)
+                    else:
+                        pass
+
         # Look for open patterns that are actually satisfied
         for state in partials:
             while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
@@ -399,8 +531,21 @@ cdef class Matcher:
                     start = state.first
                     end = len(doc)
                     ent_id = state.second.attrs[0].value
-                    label = state.second.attrs[0].value
-                    matches.append((ent_id, start, end))
+                    # ent_id = get_pattern_key(state.second)
+                    label = state.second.attrs[1].value
+                    # matches.append((ent_id, start, end))
+                    if ent_id not in matches_dict:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start >= matches_dict[ent_id][1]:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
+                        j = matches_dict[ent_id][2]
+                        matches[j] = (ent_id,start,end)
+                        matches_dict[ent_id] = (start,end,j)
+                    else:
+                        pass
         for i, (ent_id, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:

From 7072b395c9e0705338c857e7b1ef7087cdb7b928 Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Tue, 16 Jan 2018 15:46:13 -0500
Subject: [PATCH 2/9] Add greedy matcher tests

---
 spacy/tests/regression/test_issue1855.py | 63 ++++++++++++++++++++++++
 spacy/tests/test_matcher_greedy.py       | 63 ++++++++++++++++++++++++
 2 files changed, 126 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue1855.py
 create mode 100644 spacy/tests/test_matcher_greedy.py

diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py
new file mode 100644
index 000000000..882c356ca
--- /dev/null
+++ b/spacy/tests/regression/test_issue1855.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ..matcher import Matcher
+
+import pytest
+
+pattern1	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
+pattern2	= [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
+pattern3	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
+pattern4	= [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
+pattern5 	= [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
+
+re_pattern1	= 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3	= 'AA'
+re_pattern4	= 'BA*B'
+re_pattern5	= 'B*A*B'
+
+@pytest.fixture
+def text():
+	return "(ABBAAAAAB)."
+
+@pytest.fixture
+def doc(en_tokenizer,text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+@pytest.mark.parametrize('pattern,re_pattern',[
+	(pattern1,re_pattern1),
+	(pattern2,re_pattern2),
+	(pattern3,re_pattern3),
+	(pattern4,re_pattern4),
+	(pattern5,re_pattern5)])
+def test_greedy_matching(doc,text,pattern,re_pattern):
+	"""
+	Test that the greedy matching behavior of the * op
+	is consistent with other re implementations
+	"""
+	matcher = Matcher(doc.vocab)
+	matcher.add(re_pattern,None,pattern)
+	matches = matcher(doc)
+	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
+	for match,re_match in zip(matches,re_matches):
+		assert match[1:]==re_match
+
+@pytest.mark.parametrize('pattern,re_pattern',[
+	(pattern1,re_pattern1),
+	(pattern2,re_pattern2),
+	(pattern3,re_pattern3),
+	(pattern4,re_pattern4),
+	(pattern5,re_pattern5)])
+def test_match_consuming(doc,text,pattern,re_pattern):
+	"""
+	Test that matcher.__call__ consumes tokens on a match
+	similar to re.findall
+	"""
+	matcher = Matcher(doc.vocab)
+	matcher.add(re_pattern,None,pattern)
+	matches = matcher(doc)
+	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
+	assert len(matches)==len(re_matches)
\ No newline at end of file
diff --git a/spacy/tests/test_matcher_greedy.py b/spacy/tests/test_matcher_greedy.py
new file mode 100644
index 000000000..882c356ca
--- /dev/null
+++ b/spacy/tests/test_matcher_greedy.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ..matcher import Matcher
+
+import pytest
+
+pattern1	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
+pattern2	= [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
+pattern3	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
+pattern4	= [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
+pattern5 	= [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
+
+re_pattern1	= 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3	= 'AA'
+re_pattern4	= 'BA*B'
+re_pattern5	= 'B*A*B'
+
+@pytest.fixture
+def text():
+	return "(ABBAAAAAB)."
+
+@pytest.fixture
+def doc(en_tokenizer,text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+@pytest.mark.parametrize('pattern,re_pattern',[
+	(pattern1,re_pattern1),
+	(pattern2,re_pattern2),
+	(pattern3,re_pattern3),
+	(pattern4,re_pattern4),
+	(pattern5,re_pattern5)])
+def test_greedy_matching(doc,text,pattern,re_pattern):
+	"""
+	Test that the greedy matching behavior of the * op
+	is consistent with other re implementations
+	"""
+	matcher = Matcher(doc.vocab)
+	matcher.add(re_pattern,None,pattern)
+	matches = matcher(doc)
+	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
+	for match,re_match in zip(matches,re_matches):
+		assert match[1:]==re_match
+
+@pytest.mark.parametrize('pattern,re_pattern',[
+	(pattern1,re_pattern1),
+	(pattern2,re_pattern2),
+	(pattern3,re_pattern3),
+	(pattern4,re_pattern4),
+	(pattern5,re_pattern5)])
+def test_match_consuming(doc,text,pattern,re_pattern):
+	"""
+	Test that matcher.__call__ consumes tokens on a match
+	similar to re.findall
+	"""
+	matcher = Matcher(doc.vocab)
+	matcher.add(re_pattern,None,pattern)
+	matches = matcher(doc)
+	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
+	assert len(matches)==len(re_matches)
\ No newline at end of file

From 490bc82c27d411fbaa8d49b0d581114e2ee88460 Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Mon, 22 Jan 2018 10:03:12 -0500
Subject: [PATCH 3/9] Add comments clarifying matcher logic for '*'

---
 spacy/matcher.pyx | 47 ++++++++---------------------------------------
 1 file changed, 8 insertions(+), 39 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 6d40045ae..738cd8f5d 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -361,7 +361,8 @@ cdef class Matcher:
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 
-
+                # ADVANCE_PLUS acts like REPEAT, but also pushes a partial that
+                # acts like an ADVANCE_ZERO
                 if action == ADVANCE_PLUS:
                     state.second += 1
                     partials.push_back(state)
@@ -372,10 +373,13 @@ cdef class Matcher:
                 if action == ADVANCE:
                     state.second += 1
 
+                # Check for partial matches that are at the same spec in the same pattern
+                # Keep the longer of the matches
+                # This ensures that there are never more than 2 partials for every spec
+                # in a pattern (one of which gets pruned in this step)
+
                 overlap=False
                 for i in range(q):
-                    if ent_id != get_pattern_key(partials[i].second):
-                        continue
                     if state.second == partials[i].second and state.first < partials[i].first:
                         partials[i] = state
                         j = i
@@ -385,26 +389,12 @@ cdef class Matcher:
                     continue
                 overlap=False
                 for i in range(q):
-                    if ent_id != get_pattern_key(partials[i].second):
-                        continue
                     if state.second == partials[i].second:
                         overlap = True
                         break
                 if overlap:
                     continue
 
-                # overlap=False
-                # for i in range(q):
-                #     if state.second == partials[i].second:
-                #         if state.first < partials[i].first:
-                #             partials[i] = state
-                #             j = i-1
-                #         else:
-                #             overlap=True
-                #         break
-                # if overlap:
-                #     continue
-
     
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
@@ -425,10 +415,9 @@ cdef class Matcher:
                     # ent_id = state.second[1].attrs[0].value
                     # ent_id = get_pattern_key(state.second)
                     label = state.second[1].attrs[1].value
-                    # matches.append((ent_id, start, end))
                     # Check that this match doesn't overlap with an earlier match.
                     # Only overwrite an earlier match if it is a substring of this
-                    # match.
+                    # match (i.e. it starts after this match starts).
 
                     if ent_id not in matches_dict:
                         matches_dict[ent_id] = (start,end,len(matches))
@@ -454,23 +443,8 @@ cdef class Matcher:
                 action = get_action(pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
-                # while acton == ADVANCE_ZERO:
-                #     pattern += 1
-                #     action = get_action(pattern,token)
-                # if action == PANIC:
-                #     raise Exception("Error selecting action in matcher")
                 while action in (ADVANCE_PLUS,ADVANCE_ZERO):
                     if action == ADVANCE_PLUS:
-                        # j=0
-                        # overlap = False
-                        # for j in range(q):
-                        #     if pattern == partials[j].second:
-                        #         overlap = True
-                        #         break
-                        # if overlap:
-                        #     pattern += 1
-                        #     action = get_action(pattern, token)
-                        #     continue
                         state.first = token_i
                         state.second = pattern
                         partials.push_back(state)
@@ -483,8 +457,6 @@ cdef class Matcher:
                 j=0
                 overlap = False
                 for j in range(q):
-                    if ent_id == get_pattern_key(partials[j].second):
-                        continue
                     if pattern == partials[j].second:
                         overlap = True
                         break
@@ -508,7 +480,6 @@ cdef class Matcher:
                     start = token_i
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
-                    # ent_id = get_pattern_key(state.second)
                     label = pattern[1].attrs[1].value
                     if ent_id not in matches_dict:
                         matches_dict[ent_id] = (start,end,len(matches))
@@ -531,9 +502,7 @@ cdef class Matcher:
                     start = state.first
                     end = len(doc)
                     ent_id = state.second.attrs[0].value
-                    # ent_id = get_pattern_key(state.second)
                     label = state.second.attrs[1].value
-                    # matches.append((ent_id, start, end))
                     if ent_id not in matches_dict:
                         matches_dict[ent_id] = (start,end,len(matches))
                         matches.append((ent_id,start,end))

From d55992bdf085611f55cb6b7fb61a65b0bcfab31d Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Mon, 22 Jan 2018 15:36:47 -0500
Subject: [PATCH 4/9] Switch match dictionary to use final state pointer rather
 than ID

---
 spacy/matcher.pyx | 114 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 26 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 738cd8f5d..dd8e0b55c 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -8,9 +8,13 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
+from libcpp.unordered_map cimport unordered_map as umap
+from cython.operator cimport dereference as deref
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
+from libc.stdio cimport printf
+
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .structs cimport TokenC
@@ -85,6 +89,11 @@ cdef struct TokenPatternC:
 ctypedef TokenPatternC* TokenPatternC_ptr
 ctypedef pair[int, TokenPatternC_ptr] StateC
 
+# Match Dictionary entry type
+cdef struct MatchEntryC:
+    int32_t start
+    int32_t end
+    int32_t offset
 
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                  object token_specs) except NULL:
@@ -336,8 +345,11 @@ cdef class Matcher:
         cdef int j = 0
         cdef int k
         cdef bint add_match,overlap = False
+        cdef TokenPatternC_ptr final_state
+        cdef umap[TokenPatternC_ptr,MatchEntryC] matches_dict
+        cdef umap[TokenPatternC_ptr,MatchEntryC].iterator state_match
+        cdef MatchEntryC new_match
         matches = []
-        matches_dict = {}
         for token_i in range(doc.length):
             token = &doc.c[token_i]
             q = 0
@@ -350,8 +362,18 @@ cdef class Matcher:
                 action = get_action(state.second, token)
                 j += 1
                 # Skip patterns that would overlap with an existing match
-                ent_id = get_pattern_key(state.second)
-                if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.first<matches_dict[ent_id][1]:
+                # Patterns overlap an existing match if they point to the
+                # same final state and start between the start and end
+                # of said match.
+                # Different patterns with the same label are allowed to 
+                # overlap.
+                final_state = state.second
+                while final_state.nr_attr != 0:
+                    final_state+=1
+                state_match = matches_dict.find(final_state)
+                if (state_match != matches_dict.end() 
+                    and state.first>deref(state_match).second.start 
+                    and state.first<deref(state_match).second.end):
                     continue
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
@@ -412,23 +434,34 @@ cdef class Matcher:
                     # to adjust the start position.
                     start = state.first
                     end = token_i+1 if action == ACCEPT else token_i
-                    # ent_id = state.second[1].attrs[0].value
+                    ent_id = state.second[1].attrs[0].value
                     # ent_id = get_pattern_key(state.second)
                     label = state.second[1].attrs[1].value
                     # Check that this match doesn't overlap with an earlier match.
                     # Only overwrite an earlier match if it is a substring of this
                     # match (i.e. it starts after this match starts).
+                    final_state = state.second+1
+                    state_match = matches_dict.find(final_state)
 
-                    if ent_id not in matches_dict:
-                        matches_dict[ent_id] = (start,end,len(matches))
+                    if state_match == matches_dict.end():
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = len(matches)
+                        matches_dict[final_state] = new_match
                         matches.append((ent_id,start,end))
-                    elif start >= matches_dict[ent_id][1]:
-                        matches_dict[ent_id] = (start,end,len(matches))
+                    elif start >= deref(state_match).second.end:
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = len(matches)
+                        matches_dict[final_state] = new_match
                         matches.append((ent_id,start,end))
-                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
-                        i = matches_dict[ent_id][2]
+                    elif start <= deref(state_match).second.start and end>=deref(state_match).second.end:
+                        i = deref(state_match).second.offset
                         matches[i] = (ent_id,start,end)
-                        matches_dict[ent_id] = (start,end,i)
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = i
+                        matches_dict[final_state] = new_match
                     else:
                         pass
 
@@ -438,7 +471,13 @@ cdef class Matcher:
             for pattern in self.patterns:
                 # Skip patterns that would overlap with an existing match
                 ent_id = get_pattern_key(pattern)
-                if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_i<matches_dict[ent_id][1]:
+                final_state = pattern
+                while final_state.nr_attr != 0:
+                    final_state+=1
+                state_match = matches_dict.find(final_state)
+                if (state_match != matches_dict.end() 
+                    and token_i>deref(state_match).second.start 
+                    and token_i<deref(state_match).second.end):
                     continue
                 action = get_action(pattern, token)
                 if action == PANIC:
@@ -480,17 +519,29 @@ cdef class Matcher:
                     start = token_i
                     end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
+
                     label = pattern[1].attrs[1].value
-                    if ent_id not in matches_dict:
-                        matches_dict[ent_id] = (start,end,len(matches))
+                    final_state = pattern+1
+                    state_match = matches_dict.find(final_state)
+                    if state_match == matches_dict.end():
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = len(matches)
+                        matches_dict[final_state] = new_match
                         matches.append((ent_id,start,end))
-                    elif start >= matches_dict[ent_id][1]:
-                        matches_dict[ent_id] = (start,end,len(matches))
+                    elif start >= deref(state_match).second.end:
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = len(matches)
+                        matches_dict[final_state] = new_match
                         matches.append((ent_id,start,end))
-                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
-                        j = matches_dict[ent_id][2]
+                    elif start <= deref(state_match).second.start and end>=deref(state_match).second.end:
+                        j = deref(state_match).second.offset
                         matches[j] = (ent_id,start,end)
-                        matches_dict[ent_id] = (start,end,j)
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = j
+                        matches_dict[final_state] = new_match
                     else:
                         pass
 
@@ -503,16 +554,27 @@ cdef class Matcher:
                     end = len(doc)
                     ent_id = state.second.attrs[0].value
                     label = state.second.attrs[1].value
-                    if ent_id not in matches_dict:
-                        matches_dict[ent_id] = (start,end,len(matches))
+                    final_state = state.second
+                    state_match = matches_dict.find(final_state)
+                    if state_match == matches_dict.end():
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = len(matches)
+                        matches_dict[final_state] = new_match
                         matches.append((ent_id,start,end))
-                    elif start >= matches_dict[ent_id][1]:
-                        matches_dict[ent_id] = (start,end,len(matches))
+                    elif start >= deref(state_match).second.end:
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = len(matches)
+                        matches_dict[final_state] = new_match
                         matches.append((ent_id,start,end))
-                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
-                        j = matches_dict[ent_id][2]
+                    elif start <= deref(state_match).second.start and end>=deref(state_match).second.end:
+                        j = deref(state_match).second.offset
                         matches[j] = (ent_id,start,end)
-                        matches_dict[ent_id] = (start,end,j)
+                        new_match.start = start
+                        new_match.end = end
+                        new_match.offset = j
+                        matches_dict[final_state] = new_match
                     else:
                         pass
         for i, (ent_id, start, end) in enumerate(matches):

From daefed0a342d5cac87544070ab762dd9349a1de5 Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Mon, 22 Jan 2018 15:55:44 -0500
Subject: [PATCH 5/9] Correct documentation of '+' and '*' ops

---
 .../usage/_linguistic-features/_rule-based-matching.jade    | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/website/usage/_linguistic-features/_rule-based-matching.jade b/website/usage/_linguistic-features/_rule-based-matching.jade
index 82d48e438..794b6595c 100644
--- a/website/usage/_linguistic-features/_rule-based-matching.jade
+++ b/website/usage/_linguistic-features/_rule-based-matching.jade
@@ -161,11 +161,7 @@ p
 
 p
     |  The #[code +] and #[code *] operators are usually interpretted
-    |  "greedily", i.e. longer matches are returned where possible. However, if
-    |  you specify two #[code +] and #[code *] patterns in a row and their
-    |  matches overlap, the first operator will behave non-greedily. This quirk
-    |  in the semantics makes the matcher more efficient, by avoiding the need
-    |  for back-tracking.
+    |  "greedily", i.e. longer matches are returned where possible.
 
 +h(3, "adding-phrase-patterns") Adding phrase patterns
 

From 3a491093eed839c298e9aefdc3d491b86f54c436 Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Mon, 22 Jan 2018 16:46:25 -0500
Subject: [PATCH 6/9] Import libcpp.map if libcpp.unordered_map doesn't exist

---
 spacy/matcher.pyx | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index dd8e0b55c..061936105 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -8,12 +8,14 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
-from libcpp.unordered_map cimport unordered_map as umap
 from cython.operator cimport dereference as deref
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-from libc.stdio cimport printf
+try:
+    from libcpp.unordered_map cimport unordered_map as umap
+except:
+    from libcpp.map cimport map as umap
 
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t

From 686735b94e0a79c4111e5783ec108bd1bdc71238 Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Mon, 22 Jan 2018 16:53:05 -0500
Subject: [PATCH 7/9] Fix matcher import

---
 spacy/tests/regression/test_issue1855.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue1855.py b/spacy/tests/regression/test_issue1855.py
index 882c356ca..aeaad9413 100644
--- a/spacy/tests/regression/test_issue1855.py
+++ b/spacy/tests/regression/test_issue1855.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 import re
 
-from ..matcher import Matcher
+from ...matcher import Matcher
 
 import pytest
 

From f50bb1aafc70bfd728dde79f741e8917dd9741d2 Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Tue, 23 Jan 2018 14:40:03 -0500
Subject: [PATCH 8/9] Restructure StateC to eliminate dependency on
 unordered_map

---
 spacy/matcher.pyx | 216 +++++++++++++++++++++++-----------------------
 1 file changed, 106 insertions(+), 110 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 061936105..d9804a922 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -12,10 +12,10 @@ from cython.operator cimport dereference as deref
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-try:
-    from libcpp.unordered_map cimport unordered_map as umap
-except:
-    from libcpp.map cimport map as umap
+# try:
+#     from libcpp.unordered_map cimport unordered_map as umap
+# except:
+#     from libcpp.map cimport map as umap
 
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
@@ -72,6 +72,7 @@ cdef enum action_t:
     ACCEPT_PREV
     PANIC
 
+
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
@@ -89,7 +90,7 @@ cdef struct TokenPatternC:
 
 
 ctypedef TokenPatternC* TokenPatternC_ptr
-ctypedef pair[int, TokenPatternC_ptr] StateC
+# ctypedef pair[int, TokenPatternC_ptr] StateC
 
 # Match Dictionary entry type
 cdef struct MatchEntryC:
@@ -97,6 +98,19 @@ cdef struct MatchEntryC:
     int32_t end
     int32_t offset
 
+# A StateC instance holds the information that defines a partial
+# match:
+# start: the index of the first token in the partial match
+# pattern: a pointer to the current token pattern in the full
+#       pattern
+# last_match: a pointer to the entry for the last span matched by
+#       the same pattern
+cdef struct StateC:
+    int32_t start
+    TokenPatternC_ptr pattern
+    MatchEntryC* last_match
+
+
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                  object token_specs) except NULL:
     pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
@@ -346,11 +360,15 @@ cdef class Matcher:
         cdef StateC state
         cdef int j = 0
         cdef int k
-        cdef bint add_match,overlap = False
-        cdef TokenPatternC_ptr final_state
-        cdef umap[TokenPatternC_ptr,MatchEntryC] matches_dict
-        cdef umap[TokenPatternC_ptr,MatchEntryC].iterator state_match
-        cdef MatchEntryC new_match
+        cdef bint overlap = False
+        cdef MatchEntryC* state_match 
+        cdef MatchEntryC* last_matches = <MatchEntryC*>self.mem.alloc(self.patterns.size(),sizeof(MatchEntryC))
+
+        for i in range(self.patterns.size()):
+            last_matches[i].start = 0
+            last_matches[i].end = 0
+            last_matches[i].offset = 0
+
         matches = []
         for token_i in range(doc.length):
             token = &doc.c[token_i]
@@ -361,7 +379,7 @@ cdef class Matcher:
             j=0
             while j < n_partials:
                 state = partials[j]
-                action = get_action(state.second, token)
+                action = get_action(state.pattern, token)
                 j += 1
                 # Skip patterns that would overlap with an existing match
                 # Patterns overlap an existing match if they point to the
@@ -369,33 +387,29 @@ cdef class Matcher:
                 # of said match.
                 # Different patterns with the same label are allowed to 
                 # overlap.
-                final_state = state.second
-                while final_state.nr_attr != 0:
-                    final_state+=1
-                state_match = matches_dict.find(final_state)
-                if (state_match != matches_dict.end() 
-                    and state.first>deref(state_match).second.start 
-                    and state.first<deref(state_match).second.end):
+                state_match = state.last_match
+                if (state.start > state_match.start 
+                    and state.start < state_match.end):
                     continue
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 while action == ADVANCE_ZERO:
-                    state.second += 1
-                    action = get_action(state.second, token)
+                    state.pattern += 1
+                    action = get_action(state.pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 
                 # ADVANCE_PLUS acts like REPEAT, but also pushes a partial that
                 # acts like and ADVANCE_ZERO
                 if action == ADVANCE_PLUS:
-                    state.second += 1
+                    state.pattern += 1
                     partials.push_back(state)
                     n_partials += 1
-                    state.second -= 1
+                    state.pattern -= 1
                     action = REPEAT
 
                 if action == ADVANCE:
-                    state.second += 1
+                    state.pattern += 1
 
                 # Check for partial matches that are at the same spec in the same pattern
                 # Keep the longer of the matches
@@ -404,7 +418,7 @@ cdef class Matcher:
 
                 overlap=False
                 for i in range(q):
-                    if state.second == partials[i].second and state.first < partials[i].first:
+                    if state.pattern == partials[i].pattern and state.start < partials[i].start:
                         partials[i] = state
                         j = i
                         overlap = True
@@ -413,7 +427,7 @@ cdef class Matcher:
                     continue
                 overlap=False
                 for i in range(q):
-                    if state.second == partials[i].second:
+                    if state.pattern == partials[i].pattern:
                         overlap = True
                         break
                 if overlap:
@@ -434,60 +448,53 @@ cdef class Matcher:
                 elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    start = state.first
+                    start = state.start
                     end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.second[1].attrs[0].value
-                    # ent_id = get_pattern_key(state.second)
-                    label = state.second[1].attrs[1].value
+                    ent_id = state.pattern[1].attrs[0].value
+                    label = state.pattern[1].attrs[1].value
                     # Check that this match doesn't overlap with an earlier match.
                     # Only overwrite an earlier match if it is a substring of this
                     # match (i.e. it starts after this match starts).
-                    final_state = state.second+1
-                    state_match = matches_dict.find(final_state)
+                    state_match = state.last_match
 
-                    if state_match == matches_dict.end():
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = len(matches)
-                        matches_dict[final_state] = new_match
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
                         matches.append((ent_id,start,end))
-                    elif start >= deref(state_match).second.end:
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = len(matches)
-                        matches_dict[final_state] = new_match
-                        matches.append((ent_id,start,end))
-                    elif start <= deref(state_match).second.start and end>=deref(state_match).second.end:
-                        i = deref(state_match).second.offset
-                        matches[i] = (ent_id,start,end)
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = i
-                        matches_dict[final_state] = new_match
+                    elif start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            assert state_match.offset==0
+                            state_match.offset = 0
+                            matches.append((ent_id,start,end))
+                        else:
+                            i = state_match.offset
+                            matches[i] = (ent_id,start,end)
+                        state_match.start = start
+                        state_match.end = end
                     else:
                         pass
 
             partials.resize(q)
             n_partials = q
             # Check whether we open any new patterns on this token
+            i=0
             for pattern in self.patterns:
                 # Skip patterns that would overlap with an existing match
-                ent_id = get_pattern_key(pattern)
-                final_state = pattern
-                while final_state.nr_attr != 0:
-                    final_state+=1
-                state_match = matches_dict.find(final_state)
-                if (state_match != matches_dict.end() 
-                    and token_i>deref(state_match).second.start 
-                    and token_i<deref(state_match).second.end):
+                # state_match = pattern.last_match
+                state_match = &last_matches[i]
+                i+=1
+                if (token_i > state_match.start 
+                    and token_i < state_match.end):
                     continue
                 action = get_action(pattern, token)
                 if action == PANIC:
                     raise Exception("Error selecting action in matcher")
                 while action in (ADVANCE_PLUS,ADVANCE_ZERO):
                     if action == ADVANCE_PLUS:
-                        state.first = token_i
-                        state.second = pattern
+                        state.start = token_i
+                        state.pattern = pattern
+                        state.last_match = state_match
                         partials.push_back(state)
                         n_partials += 1
                     pattern += 1
@@ -498,7 +505,7 @@ cdef class Matcher:
                 j=0
                 overlap = False
                 for j in range(q):
-                    if pattern == partials[j].second:
+                    if pattern == partials[j].pattern:
                         overlap = True
                         break
                 if overlap:
@@ -506,15 +513,17 @@ cdef class Matcher:
 
 
                 if action == REPEAT:
-                    state.first = token_i
-                    state.second = pattern
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
                     n_partials += 1
                 elif action == ADVANCE:
                     # TODO: What to do about patterns starting with ZERO? Need
                     # to adjust the start position.
-                    state.first = token_i
-                    state.second = pattern
+                    state.start = token_i
+                    state.pattern = pattern
+                    state.last_match = state_match
                     partials.push_back(state)
                     n_partials += 1
                 elif action in (ACCEPT, ACCEPT_PREV):
@@ -523,60 +532,47 @@ cdef class Matcher:
                     ent_id = pattern[1].attrs[0].value
 
                     label = pattern[1].attrs[1].value
-                    final_state = pattern+1
-                    state_match = matches_dict.find(final_state)
-                    if state_match == matches_dict.end():
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = len(matches)
-                        matches_dict[final_state] = new_match
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
                         matches.append((ent_id,start,end))
-                    elif start >= deref(state_match).second.end:
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = len(matches)
-                        matches_dict[final_state] = new_match
-                        matches.append((ent_id,start,end))
-                    elif start <= deref(state_match).second.start and end>=deref(state_match).second.end:
-                        j = deref(state_match).second.offset
-                        matches[j] = (ent_id,start,end)
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = j
-                        matches_dict[final_state] = new_match
+                    if start <= state_match.start and end >= state_match.end:
+                        if len(matches) == 0:
+                            state_match.offset = 0
+                            matches.append((ent_id,start,end))
+                        else:
+                            j = state_match.offset
+                            matches[j] = (ent_id,start,end)
+                        state_match.start = start
+                        state_match.end = end
                     else:
                         pass
 
         # Look for open patterns that are actually satisfied
         for state in partials:
-            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
-                state.second += 1
-                if state.second.nr_attr == 0:
-                    start = state.first
+            while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
+                state.pattern += 1
+                if state.pattern.nr_attr == 0:
+                    start = state.start
                     end = len(doc)
-                    ent_id = state.second.attrs[0].value
-                    label = state.second.attrs[1].value
-                    final_state = state.second
-                    state_match = matches_dict.find(final_state)
-                    if state_match == matches_dict.end():
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = len(matches)
-                        matches_dict[final_state] = new_match
+                    ent_id = state.pattern.attrs[0].value
+                    label = state.pattern.attrs[1].value
+                    state_match = state.last_match
+                    if start >= state_match.end:
+                        state_match.start = start
+                        state_match.end = end
+                        state_match.offset = len(matches)
                         matches.append((ent_id,start,end))
-                    elif start >= deref(state_match).second.end:
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = len(matches)
-                        matches_dict[final_state] = new_match
-                        matches.append((ent_id,start,end))
-                    elif start <= deref(state_match).second.start and end>=deref(state_match).second.end:
-                        j = deref(state_match).second.offset
-                        matches[j] = (ent_id,start,end)
-                        new_match.start = start
-                        new_match.end = end
-                        new_match.offset = j
-                        matches_dict[final_state] = new_match
+                    if start <= state_match.start and end >= state_match.end:
+                        j = state_match.offset
+                        if len(matches) == 0:
+                            state_match.offset = 0
+                            matches.append((ent_id,start,end))
+                        else:
+                            matches[j] = (ent_id,start,end)
+                        state_match.start = start
+                        state_match.end = end
                     else:
                         pass
         for i, (ent_id, start, end) in enumerate(matches):

From 85ab99e6929b91ce0331267116b7142fc3be612f Mon Sep 17 00:00:00 2001
From: greg <greg.dubbin@spreemo.com>
Date: Tue, 23 Jan 2018 15:00:14 -0500
Subject: [PATCH 9/9] Correct test examples

---
 spacy/tests/regression/test_issue1450.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py
index 6f1d4f568..3c8f975d9 100644
--- a/spacy/tests/regression/test_issue1450.py
+++ b/spacy/tests/regression/test_issue1450.py
@@ -13,8 +13,8 @@ from ...vocab import Vocab
         ('a b', 0, 2),
         ('a c', 0, 1),
         ('a b c', 0, 2),
-        ('a b b c', 0, 2),
-        ('a b b', 0, 2),
+        ('a b b c', 0, 3),
+        ('a b b', 0, 3),
     ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):