From 0d9740e8266cfbdc74d15c708427f759f1f02588 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 19 Sep 2019 16:36:12 +0200 Subject: [PATCH 01/12] Replace PhraseMatcher with Aho-Corasick Replace PhraseMatcher with the Aho-Corasick algorithm over numpy arrays of the hash values for the relevant attribute. The implementation is based on FlashText. The speed should be similar to the previous PhraseMatcher. It is now possible to easily remove match IDs and matches don't go missing with large keyword lists / vocabularies. Fixes #4308. --- spacy/matcher/phrasematcher.pxd | 5 - spacy/matcher/phrasematcher.pyx | 267 +++++++++++---------- spacy/tests/matcher/test_phrase_matcher.py | 52 ++++ 3 files changed, 189 insertions(+), 135 deletions(-) diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index 3aba1686f..e69de29bb 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -1,5 +0,0 @@ -from libcpp.vector cimport vector - -from ..typedefs cimport hash_t - -ctypedef vector[hash_t] hash_vec diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 9e8801cc1..76a66c506 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -2,28 +2,14 @@ # cython: profile=True from __future__ import unicode_literals -from libcpp.vector cimport vector -from cymem.cymem cimport Pool -from murmurhash.mrmr cimport hash64 -from preshed.maps cimport PreshMap +import numpy as np -from .matcher cimport Matcher from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr -from ..typedefs cimport attr_t, hash_t from ._schemas import TOKEN_PATTERN_SCHEMA from ..errors import Errors, Warnings, deprecation_warning, user_warning -from ..attrs import FLAG61 as U_ENT -from ..attrs import FLAG60 as B2_ENT -from ..attrs import FLAG59 as B3_ENT -from ..attrs import FLAG58 as B4_ENT -from ..attrs import FLAG43 as L2_ENT -from ..attrs import FLAG42 as L3_ENT -from ..attrs import FLAG41 as L4_ENT -from ..attrs import FLAG42 as I3_ENT -from ..attrs import FLAG41 as I4_ENT cdef class PhraseMatcher: @@ -33,18 +19,18 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher + + Adapted from FlashText: https://github.com/vi3k6i5/flashtext + MIT License (see `LICENSE`) + Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) """ - cdef Pool mem cdef Vocab vocab - cdef Matcher matcher - cdef PreshMap phrase_ids - cdef vector[hash_vec] ent_id_matrix - cdef int max_length + cdef unicode _terminal + cdef object keyword_trie_dict cdef attr_id_t attr - cdef public object _callbacks - cdef public object _patterns - cdef public object _docs - cdef public object _validate + cdef object _callbacks + cdef object _keywords + cdef bint _validate def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. 
@@ -58,10 +44,13 @@ cdef class PhraseMatcher: """ if max_length != 0: deprecation_warning(Warnings.W010) - self.mem = Pool() - self.max_length = max_length self.vocab = vocab - self.matcher = Matcher(self.vocab, validate=False) + self._terminal = '_terminal_' + self.keyword_trie_dict = dict() + self._callbacks = {} + self._keywords = {} + self._validate = validate + if isinstance(attr, long): self.attr = attr else: @@ -71,28 +60,15 @@ cdef class PhraseMatcher: if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) self.attr = self.vocab.strings[attr] - self.phrase_ids = PreshMap() - abstract_patterns = [ - [{U_ENT: True}], - [{B2_ENT: True}, {L2_ENT: True}], - [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}], - [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}], - ] - self.matcher.add("Candidate", None, *abstract_patterns) - self._callbacks = {} - self._docs = {} - self._validate = validate def __len__(self): - """Get the number of rules added to the matcher. Note that this only - returns the number of rules (identical with the number of IDs), not the - number of individual patterns. + """Get the number of match IDs added to the matcher. RETURNS (int): The number of rules. DOCS: https://spacy.io/api/phrasematcher#len """ - return len(self._docs) + return len(self._callbacks) def __contains__(self, key): """Check whether the matcher contains rules for a match ID. @@ -102,12 +78,48 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#contains """ - cdef hash_t ent_id = self.matcher._normalize_key(key) - return ent_id in self._callbacks + return key in self._callbacks - def __reduce__(self): - data = (self.vocab, self._docs, self._callbacks) - return (unpickle_matcher, data, None, None) + def remove(self, key): + """Remove a match-rule from the matcher by match ID. + + key (unicode): The match ID. + """ + if key not in self._keywords: + return + for keyword in self._keywords[key]: + current_dict = self.keyword_trie_dict + token_trie_list = [] + for tokens in keyword: + if tokens in current_dict: + token_trie_list.append((tokens, current_dict)) + current_dict = current_dict[tokens] + else: + # if token is not found, break out of the loop + current_dict = None + break + # remove the tokens from trie dict if there are no other + # keywords with them + if current_dict and self._terminal in current_dict: + # if this is the only remaining key, remove unnecessary paths + if current_dict[self._terminal] == [key]: + # we found a complete match for input keyword + token_trie_list.append((self._terminal, current_dict)) + token_trie_list.reverse() + for key_to_remove, dict_pointer in token_trie_list: + if len(dict_pointer.keys()) == 1: + dict_pointer.pop(key_to_remove) + else: + # more than one key means more than 1 path, + # delete not required path and keep the other + dict_pointer.pop(key_to_remove) + break + # otherwise simply remove the key + else: + current_dict[self._terminal].remove(key) + + del self._keywords[key] + del self._callbacks[key] def add(self, key, on_match, *docs): """Add a match-rule to the phrase-matcher. 
A match-rule consists of: an ID @@ -119,17 +131,13 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#add """ - cdef Doc doc - cdef hash_t ent_id = self.matcher._normalize_key(key) - self._callbacks[ent_id] = on_match - self._docs[ent_id] = docs - cdef int length - cdef int i - cdef hash_t phrase_hash - cdef Pool mem = Pool() + + _ = self.vocab[key] + self._callbacks[key] = on_match + self._keywords.setdefault(key, []) + for doc in docs: - length = doc.length - if length == 0: + if len(doc) == 0: continue if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged: raise ValueError(Errors.E155.format()) @@ -139,33 +147,18 @@ cdef class PhraseMatcher: and self.attr not in (DEP, POS, TAG, LEMMA): string_attr = self.vocab.strings[self.attr] user_warning(Warnings.W012.format(key=key, attr=string_attr)) - tags = get_biluo(length) - phrase_key = mem.alloc(length, sizeof(attr_t)) - for i, tag in enumerate(tags): - attr_value = self.get_lex_value(doc, i) - lexeme = self.vocab[attr_value] - lexeme.set_flag(tag, True) - phrase_key[i] = lexeme.orth - phrase_hash = hash64(phrase_key, length * sizeof(attr_t), 0) + keyword = self._convert_to_array(doc) + # keep track of keywords per key to make remove easier + # (would use a set, but can't hash numpy arrays) + if keyword not in self._keywords[key]: + self._keywords[key].append(keyword) + current_dict = self.keyword_trie_dict + for token in keyword: + current_dict = current_dict.setdefault(token, {}) + current_dict.setdefault(self._terminal, set()) + current_dict[self._terminal].add(key) - if phrase_hash in self.phrase_ids: - phrase_index = self.phrase_ids[phrase_hash] - ent_id_list = self.ent_id_matrix[phrase_index] - ent_id_list.append(ent_id) - self.ent_id_matrix[phrase_index] = ent_id_list - - else: - ent_id_list = hash_vec(1) - ent_id_list[0] = ent_id - new_index = self.ent_id_matrix.size() - if new_index == 0: - # PreshMaps can not contain 0 as value, so storing a dummy at 0 - self.ent_id_matrix.push_back(hash_vec(0)) - new_index = 1 - self.ent_id_matrix.push_back(ent_id_list) - self.phrase_ids.set(phrase_hash, new_index) - - def __call__(self, Doc doc): + def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. doc (Doc): The document to match over. @@ -175,20 +168,62 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#call """ + doc_array = self._convert_to_array(doc) matches = [] - if self.attr == ORTH: - match_doc = doc - else: - # If we're not matching on the ORTH, match_doc will be a Doc whose - # token.orth values are the attribute values we're matching on, - # e.g. 
Doc(nlp.vocab, words=[token.pos_ for token in doc]) - words = [self.get_lex_value(doc, i) for i in range(len(doc))] - match_doc = Doc(self.vocab, words=words) - for _, start, end in self.matcher(match_doc): - ent_ids = self.accept_match(match_doc, start, end) - if ent_ids is not None: - for ent_id in ent_ids: - matches.append((ent_id, start, end)) + if doc_array is None or len(doc_array) == 0: + # if doc_array is empty or None just return empty list + return matches + current_dict = self.keyword_trie_dict + start = 0 + reset_current_dict = False + idx = 0 + doc_array_len = len(doc_array) + while idx < doc_array_len: + token = doc_array[idx] + # if end is present in current_dict + if self._terminal in current_dict or token in current_dict: + if self._terminal in current_dict: + ent_id = current_dict[self._terminal] + matches.append((self.vocab.strings[ent_id], start, idx)) + + # look for longer sequences from this position + if token in current_dict: + current_dict_continued = current_dict[token] + + idy = idx + 1 + while idy < doc_array_len: + inner_token = doc_array[idy] + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + if inner_token in current_dict_continued: + current_dict_continued = current_dict_continued[inner_token] + else: + break + idy += 1 + else: + # end of doc_array reached + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + current_dict = self.keyword_trie_dict + reset_current_dict = True + else: + # we reset current_dict + current_dict = self.keyword_trie_dict + reset_current_dict = True + # if we are end of doc_array and have a sequence discovered + if idx + 1 >= doc_array_len: + if self._terminal in current_dict: + ent_ids = current_dict[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, doc_array_len)) + idx += 1 + if reset_current_dict: + reset_current_dict = False + start = idx for i, (ent_id, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: @@ -228,19 +263,6 @@ cdef class PhraseMatcher: else: yield doc - def accept_match(self, Doc doc, int start, int end): - cdef int i, j - cdef Pool mem = Pool() - phrase_key = mem.alloc(end-start, sizeof(attr_t)) - for i, j in enumerate(range(start, end)): - phrase_key[i] = doc.c[j].lex.orth - cdef hash_t key = hash64(phrase_key, (end-start) * sizeof(attr_t), 0) - - ent_index = self.phrase_ids.get(key) - if ent_index == 0: - return None - return self.ent_id_matrix[ent_index] - def get_lex_value(self, Doc doc, int i): if self.attr == ORTH: # Return the regular orth value of the lexeme @@ -256,25 +278,10 @@ cdef class PhraseMatcher: # Concatenate the attr name and value to not pollute lexeme space # e.g. 
'POS-VERB' instead of just 'VERB', which could otherwise # create false positive matches - return "matcher:{}-{}".format(string_attr_name, string_attr_value) + matcher_attr_string = "matcher:{}-{}".format(string_attr_name, string_attr_value) + # Add new string to vocab + _ = self.vocab[matcher_attr_string] + return self.vocab.strings[matcher_attr_string] - -def get_biluo(length): - if length == 0: - raise ValueError(Errors.E127) - elif length == 1: - return [U_ENT] - elif length == 2: - return [B2_ENT, L2_ENT] - elif length == 3: - return [B3_ENT, I3_ENT, L3_ENT] - else: - return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT] - - -def unpickle_matcher(vocab, docs, callbacks): - matcher = PhraseMatcher(vocab) - for key, specs in docs.items(): - callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) - return matcher + def _convert_to_array(self, Doc doc): + return np.array([self.get_lex_value(doc, i) for i in range(len(doc))], dtype=np.uint64) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index b82f9a058..a9f5ac990 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -31,6 +31,58 @@ def test_phrase_matcher_contains(en_vocab): assert "TEST2" not in matcher +def test_phrase_matcher_repeated_add(en_vocab): + matcher = PhraseMatcher(en_vocab) + # match ID only gets added once + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + assert "TEST" in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 1 + + +def test_phrase_matcher_remove(en_vocab): + matcher = PhraseMatcher(en_vocab) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + assert "TEST" in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 1 + matcher.remove("TEST") + assert "TEST" not in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 0 + matcher.remove("TEST2") + assert "TEST" not in matcher + assert "TEST2" not in matcher + assert len(matcher(doc)) == 0 + + +def test_phrase_matcher_overlapping_with_remove(en_vocab): + matcher = PhraseMatcher(en_vocab) + matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + # TEST2 is added alongside TEST + matcher.add("TEST2", None, Doc(en_vocab, words=["like"])) + doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + assert "TEST" in matcher + assert len(matcher) == 2 + assert len(matcher(doc)) == 2 + # removing TEST does not remove the entry for TEST2 + matcher.remove("TEST") + assert "TEST" not in matcher + assert len(matcher) == 1 + assert len(matcher(doc)) == 1 + assert matcher(doc)[0][0] == en_vocab.strings["TEST2"] + # removing TEST2 removes all + matcher.remove("TEST2") + assert "TEST2" not in matcher + assert len(matcher) == 0 + assert len(matcher(doc)) == 0 + + def test_phrase_matcher_string_attrs(en_vocab): words1 = ["I", "like", "cats"] pos1 = ["PRON", "VERB", "NOUN"] From 0d851db6d9f7cc7630718091e64d227ce3e4bd89 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 19 Sep 2019 20:20:53 +0200 Subject: [PATCH 02/12] Restore support for pickling --- spacy/matcher/phrasematcher.pyx | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git 
a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 76a66c506..67f4ac83d 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -30,6 +30,7 @@ cdef class PhraseMatcher: cdef attr_id_t attr cdef object _callbacks cdef object _keywords + cdef object _docs cdef bint _validate def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): @@ -49,6 +50,7 @@ cdef class PhraseMatcher: self.keyword_trie_dict = dict() self._callbacks = {} self._keywords = {} + self._docs = {} self._validate = validate if isinstance(attr, long): @@ -80,6 +82,10 @@ cdef class PhraseMatcher: """ return key in self._callbacks + def __reduce__(self): + data = (self.vocab, self._docs, self._callbacks) + return (unpickle_matcher, data, None, None) + def remove(self, key): """Remove a match-rule from the matcher by match ID. @@ -120,6 +126,7 @@ cdef class PhraseMatcher: del self._keywords[key] del self._callbacks[key] + del self._docs[key] def add(self, key, on_match, *docs): """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID @@ -135,6 +142,8 @@ cdef class PhraseMatcher: _ = self.vocab[key] self._callbacks[key] = on_match self._keywords.setdefault(key, []) + self._docs.setdefault(key, set()) + self._docs[key].update(docs) for doc in docs: if len(doc) == 0: @@ -285,3 +294,11 @@ cdef class PhraseMatcher: def _convert_to_array(self, Doc doc): return np.array([self.get_lex_value(doc, i) for i in range(len(doc))], dtype=np.uint64) + + +def unpickle_matcher(vocab, docs, callbacks): + matcher = PhraseMatcher(vocab) + for key, specs in docs.items(): + callback = callbacks.get(key, None) + matcher.add(key, callback, *specs) + return matcher From 3a4e1f5ca7ff550e90d0c309d8673a296e05e41c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 20 Sep 2019 09:18:38 +0200 Subject: [PATCH 03/12] Fix internal keyword add/remove for numpy arrays --- spacy/matcher/phrasematcher.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 67f4ac83d..ef55b0e68 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -122,7 +122,8 @@ cdef class PhraseMatcher: break # otherwise simply remove the key else: - current_dict[self._terminal].remove(key) + if key in current_dict[self._terminal]: + current_dict[self._terminal].remove(key) del self._keywords[key] del self._callbacks[key] @@ -159,8 +160,7 @@ cdef class PhraseMatcher: keyword = self._convert_to_array(doc) # keep track of keywords per key to make remove easier # (would use a set, but can't hash numpy arrays) - if keyword not in self._keywords[key]: - self._keywords[key].append(keyword) + self._keywords[key].append(keyword) current_dict = self.keyword_trie_dict for token in keyword: current_dict = current_dict.setdefault(token, {}) From c38c33058585a9de3be001d499b210b56f4d1a85 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sat, 21 Sep 2019 15:57:38 +0200 Subject: [PATCH 04/12] Add missing loop for match ID set in search loop --- spacy/matcher/phrasematcher.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index ef55b0e68..375b25b46 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -192,8 +192,9 @@ cdef class PhraseMatcher: # if end is present in current_dict if self._terminal in current_dict or token in current_dict: if self._terminal in current_dict: - ent_id = 
current_dict[self._terminal] - matches.append((self.vocab.strings[ent_id], start, idx)) + ent_ids = current_dict[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idx)) # look for longer sequences from this position if token in current_dict: From a7e9c0fd3e597cae890feecfe09c8a5f3c6a42a8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Sep 2019 09:11:13 +0200 Subject: [PATCH 05/12] Remove cruft in matching loop for partial matches There was a bit of unnecessary code left over from FlashText in the matching loop to handle partial token matches, which we don't have with PhraseMatcher. --- spacy/matcher/phrasematcher.pyx | 64 +++++++--------------- spacy/tests/matcher/test_phrase_matcher.py | 27 ++++++++- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 375b25b46..048b5bd68 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -184,56 +184,34 @@ cdef class PhraseMatcher: return matches current_dict = self.keyword_trie_dict start = 0 - reset_current_dict = False idx = 0 doc_array_len = len(doc_array) while idx < doc_array_len: + start = idx token = doc_array[idx] - # if end is present in current_dict - if self._terminal in current_dict or token in current_dict: - if self._terminal in current_dict: - ent_ids = current_dict[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idx)) - - # look for longer sequences from this position - if token in current_dict: - current_dict_continued = current_dict[token] - - idy = idx + 1 - while idy < doc_array_len: - inner_token = doc_array[idy] - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - if inner_token in current_dict_continued: - current_dict_continued = current_dict_continued[inner_token] - else: - break + # look for sequences from this position + if token in current_dict: + current_dict_continued = current_dict[token] + idy = idx + 1 + while idy < doc_array_len: + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + inner_token = doc_array[idy] + if inner_token in current_dict_continued: + current_dict_continued = current_dict_continued[inner_token] idy += 1 else: - # end of doc_array reached - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - current_dict = self.keyword_trie_dict - reset_current_dict = True - else: - # we reset current_dict - current_dict = self.keyword_trie_dict - reset_current_dict = True - # if we are end of doc_array and have a sequence discovered - if idx + 1 >= doc_array_len: - if self._terminal in current_dict: - ent_ids = current_dict[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, doc_array_len)) + break + else: + # end of doc_array reached + if self._terminal in current_dict_continued: + ent_ids = current_dict_continued[self._terminal] + for ent_id in ent_ids: + matches.append((self.vocab.strings[ent_id], start, idy)) + current_dict = self.keyword_trie_dict idx += 1 - if reset_current_dict: - reset_current_dict = False - start = idx for i, (ent_id, start, end) in 
enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index a9f5ac990..3eb454c39 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -8,10 +8,31 @@ from ..util import get_doc def test_matcher_phrase_matcher(en_vocab): - doc = Doc(en_vocab, words=["Google", "Now"]) - matcher = PhraseMatcher(en_vocab) - matcher.add("COMPANY", None, doc) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) + # intermediate phrase + pattern = Doc(en_vocab, words=["Google", "Now"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("COMPANY", None, pattern) + assert len(matcher(doc)) == 1 + # initial token + pattern = Doc(en_vocab, words=["I"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("I", None, pattern) + assert len(matcher(doc)) == 1 + # initial phrase + pattern = Doc(en_vocab, words=["I", "like"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("ILIKE", None, pattern) + assert len(matcher(doc)) == 1 + # final token + pattern = Doc(en_vocab, words=["best"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("BEST", None, pattern) + assert len(matcher(doc)) == 1 + # final phrase + pattern = Doc(en_vocab, words=["Now", "best"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("NOWBEST", None, pattern) assert len(matcher(doc)) == 1 From 39540ed1ce95e98a04daa26209dd6342f308c740 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 14:39:50 +0200 Subject: [PATCH 06/12] Replace dict trie with MapStruct trie --- spacy/matcher/phrasematcher.pxd | 6 + spacy/matcher/phrasematcher.pyx | 190 +++++++++++++++------ spacy/tests/matcher/test_phrase_matcher.py | 25 ++- 3 files changed, 159 insertions(+), 62 deletions(-) diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index e69de29bb..1a550989d 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -0,0 +1,6 @@ +from preshed.maps cimport key_t + +cdef struct MatchStruct: + key_t match_id + int start + int end diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 048b5bd68..172a271b0 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -2,10 +2,20 @@ # cython: profile=True from __future__ import unicode_literals +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from libcpp.vector cimport vector + +from cymem.cymem cimport Pool + +from preshed.maps cimport MapStruct, map_init, map_set, map_get_unless_missing +from preshed.maps cimport map_clear, map_iter, key_t, Result + import numpy as np from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab +from ..strings cimport hash_string from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA @@ -25,14 +35,18 @@ cdef class PhraseMatcher: Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) """ cdef Vocab vocab - cdef unicode _terminal - cdef object keyword_trie_dict cdef attr_id_t attr cdef object _callbacks cdef object _keywords cdef object _docs cdef bint _validate + cdef MapStruct* c_map + cdef Pool mem + cdef key_t _terminal_node + + cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil + def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. 
@@ -46,13 +60,16 @@ cdef class PhraseMatcher: if max_length != 0: deprecation_warning(Warnings.W010) self.vocab = vocab - self._terminal = '_terminal_' - self.keyword_trie_dict = dict() self._callbacks = {} self._keywords = {} self._docs = {} self._validate = validate + self.mem = Pool() + self.c_map = self.mem.alloc(1, sizeof(MapStruct)) + self._terminal_node = 1 # or random: np.random.randint(0, high=np.iinfo(np.uint64).max, dtype=np.uint64) + map_init(self.mem, self.c_map, 8) + if isinstance(attr, long): self.attr = attr else: @@ -93,37 +110,58 @@ cdef class PhraseMatcher: """ if key not in self._keywords: return + cdef MapStruct* current_node + cdef MapStruct* terminal_map + cdef MapStruct* node_pointer + cdef Result result + cdef key_t terminal_key + cdef void* value + cdef int c_i = 0 for keyword in self._keywords[key]: - current_dict = self.keyword_trie_dict + current_node = self.c_map token_trie_list = [] - for tokens in keyword: - if tokens in current_dict: - token_trie_list.append((tokens, current_dict)) - current_dict = current_dict[tokens] + for token in keyword: + result = map_get_unless_missing(current_node, token) + if result.found: + token_trie_list.append((token, current_node)) + current_node = result.value else: # if token is not found, break out of the loop - current_dict = None + current_node = NULL break - # remove the tokens from trie dict if there are no other + # remove the tokens from trie node if there are no other # keywords with them - if current_dict and self._terminal in current_dict: + result = map_get_unless_missing(current_node, self._terminal_node) + if current_node != NULL and result.found: # if this is the only remaining key, remove unnecessary paths - if current_dict[self._terminal] == [key]: + terminal_map = result.value + terminal_keys = [] + c_i = 0 + while map_iter(terminal_map, &c_i, &terminal_key, &value): + terminal_keys.append(self.vocab.strings[terminal_key]) + # TODO: not working, fix remove for unused paths/maps + if False and terminal_keys == [key]: # we found a complete match for input keyword - token_trie_list.append((self._terminal, current_dict)) + token_trie_list.append((self.vocab.strings[key], terminal_map)) token_trie_list.reverse() - for key_to_remove, dict_pointer in token_trie_list: - if len(dict_pointer.keys()) == 1: - dict_pointer.pop(key_to_remove) + for key_to_remove, py_node_pointer in token_trie_list: + node_pointer = py_node_pointer + result = map_get_unless_missing(node_pointer, key_to_remove) + if node_pointer.filled == 1: + map_clear(node_pointer, key_to_remove) + self.mem.free(result.value) + pass else: # more than one key means more than 1 path, # delete not required path and keep the other - dict_pointer.pop(key_to_remove) + map_clear(node_pointer, key_to_remove) + self.mem.free(result.value) break # otherwise simply remove the key else: - if key in current_dict[self._terminal]: - current_dict[self._terminal].remove(key) + result = map_get_unless_missing(current_node, self._terminal_node) + if result.found: + map_clear(result.value, self.vocab.strings[key]) del self._keywords[key] del self._callbacks[key] @@ -146,6 +184,10 @@ cdef class PhraseMatcher: self._docs.setdefault(key, set()) self._docs[key].update(docs) + cdef MapStruct* current_node + cdef MapStruct* internal_node + cdef Result result + for doc in docs: if len(doc) == 0: continue @@ -161,11 +203,23 @@ cdef class PhraseMatcher: # keep track of keywords per key to make remove easier # (would use a set, but can't hash numpy arrays) 
self._keywords[key].append(keyword) - current_dict = self.keyword_trie_dict + + current_node = self.c_map for token in keyword: - current_dict = current_dict.setdefault(token, {}) - current_dict.setdefault(self._terminal, set()) - current_dict[self._terminal].add(key) + result = map_get_unless_missing(current_node, token) + if not result.found: + internal_node = self.mem.alloc(1, sizeof(MapStruct)) + map_init(self.mem, internal_node, 8) + map_set(self.mem, current_node, token, internal_node) + result.value = internal_node + current_node = result.value + result = map_get_unless_missing(current_node, self._terminal_node) + if not result.found: + internal_node = self.mem.alloc(1, sizeof(MapStruct)) + map_init(self.mem, internal_node, 8) + map_set(self.mem, current_node, self._terminal_node, internal_node) + result.value = internal_node + map_set(self.mem, result.value, hash_string(key), NULL) def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. @@ -182,42 +236,62 @@ cdef class PhraseMatcher: if doc_array is None or len(doc_array) == 0: # if doc_array is empty or None just return empty list return matches - current_dict = self.keyword_trie_dict - start = 0 - idx = 0 - doc_array_len = len(doc_array) - while idx < doc_array_len: - start = idx - token = doc_array[idx] - # look for sequences from this position - if token in current_dict: - current_dict_continued = current_dict[token] - idy = idx + 1 - while idy < doc_array_len: - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - inner_token = doc_array[idy] - if inner_token in current_dict_continued: - current_dict_continued = current_dict_continued[inner_token] - idy += 1 - else: - break - else: - # end of doc_array reached - if self._terminal in current_dict_continued: - ent_ids = current_dict_continued[self._terminal] - for ent_id in ent_ids: - matches.append((self.vocab.strings[ent_id], start, idy)) - current_dict = self.keyword_trie_dict - idx += 1 + + if not doc_array.flags['C_CONTIGUOUS']: + doc_array = np.ascontiguousarray(doc_array) + cdef key_t[::1] doc_array_memview = doc_array + cdef vector[MatchStruct] c_matches + self.find_matches(&doc_array_memview[0], doc_array_memview.shape[0], &c_matches) + for i in range(c_matches.size()): + matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end)) for i, (ent_id, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: on_match(self, doc, i, matches) return matches + cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil: + cdef MapStruct* current_node = self.c_map + cdef int start = 0 + cdef int idx = 0 + cdef int idy = 0 + cdef key_t key + cdef void* value + cdef int i = 0 + cdef MatchStruct ms + while idx < hash_array_len: + start = idx + token = hash_array[idx] + # look for sequences from this position + result = map_get_unless_missing(current_node, token) + if result.found: + current_node = result.value + idy = idx + 1 + while idy < hash_array_len: + result = map_get_unless_missing(current_node, self._terminal_node) + if result.found: + i = 0 + while map_iter(result.value, &i, &key, &value): + ms = make_matchstruct(key, start, idy) + matches.push_back(ms) + inner_token = hash_array[idy] + result = map_get_unless_missing(current_node, inner_token) + if result.found: + current_node = result.value + idy += 1 + else: 
+ break + else: + # end of hash_array reached + result = map_get_unless_missing(current_node, self._terminal_node) + if result.found: + i = 0 + while map_iter(result.value, &i, &key, &value): + ms = make_matchstruct(key, start, idy) + matches.push_back(ms) + current_node = self.c_map + idx += 1 + def pipe(self, stream, batch_size=1000, n_threads=-1, return_matches=False, as_tuples=False): """Match a stream of documents, yielding them in turn. @@ -281,3 +355,11 @@ def unpickle_matcher(vocab, docs, callbacks): callback = callbacks.get(key, None) matcher.add(key, callback, *specs) return matcher + + +cdef MatchStruct make_matchstruct(key_t match_id, int start, int end) nogil: + cdef MatchStruct ms + ms.match_id = match_id + ms.start = start + ms.end = end + return ms diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 3eb454c39..7d65d0007 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -67,18 +67,27 @@ def test_phrase_matcher_repeated_add(en_vocab): def test_phrase_matcher_remove(en_vocab): matcher = PhraseMatcher(en_vocab) - matcher.add("TEST", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST1", None, Doc(en_vocab, words=["like"])) + matcher.add("TEST2", None, Doc(en_vocab, words=["best"])) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) - assert "TEST" in matcher - assert "TEST2" not in matcher + assert "TEST1" in matcher + assert "TEST2" in matcher + assert "TEST3" not in matcher + assert len(matcher(doc)) == 2 + matcher.remove("TEST1") + assert "TEST1" not in matcher + assert "TEST2" in matcher + assert "TEST3" not in matcher assert len(matcher(doc)) == 1 - matcher.remove("TEST") - assert "TEST" not in matcher - assert "TEST2" not in matcher - assert len(matcher(doc)) == 0 matcher.remove("TEST2") - assert "TEST" not in matcher + assert "TEST1" not in matcher assert "TEST2" not in matcher + assert "TEST3" not in matcher + assert len(matcher(doc)) == 0 + matcher.remove("TEST3") + assert "TEST1" not in matcher + assert "TEST2" not in matcher + assert "TEST3" not in matcher assert len(matcher(doc)) == 0 From d4141302b6ba8f1a7b4cb178e22a01914720456c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 15:36:26 +0200 Subject: [PATCH 07/12] Fix how match ID hash is stored/added --- spacy/matcher/phrasematcher.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 172a271b0..940ebb60e 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -15,7 +15,6 @@ import numpy as np from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab -from ..strings cimport hash_string from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA @@ -178,7 +177,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#add """ - _ = self.vocab[key] + _ = self.vocab.strings[key] self._callbacks[key] = on_match self._keywords.setdefault(key, []) self._docs.setdefault(key, set()) @@ -219,7 +218,7 @@ cdef class PhraseMatcher: map_init(self.mem, internal_node, 8) map_set(self.mem, current_node, self._terminal_node, internal_node) result.value = internal_node - map_set(self.mem, result.value, hash_string(key), NULL) + map_set(self.mem, result.value, self.vocab.strings[key], NULL) def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. 
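
A note on the data structure at this point in the series: the trie has settled into its final shape, where each node is a preshed map keyed by 64-bit attribute hashes, and a reserved terminal key points at an inner map whose keys are the match-ID hashes. A simplified dict-based sketch of the same layout (TERMINAL stands in for the reserved terminal hash; all names here are illustrative, not the actual implementation):

    # Simplified dict analogue of the MapStruct trie built in add().
    # TERMINAL stands in for the reserved terminal hash key.
    TERMINAL = object()

    def add_keyword(trie, token_hashes, match_id):
        # walk/create one child node per token hash
        node = trie
        for token in token_hashes:
            node = node.setdefault(token, {})
        # record the match ID under the terminal key of the final node
        node.setdefault(TERMINAL, set()).add(match_id)

    trie = {}
    add_keyword(trie, ["google", "now"], "COMPANY")
    add_keyword(trie, ["google"], "ORG")
    # trie is now {"google": {TERMINAL: {"ORG"}, "now": {TERMINAL: {"COMPANY"}}}}

Keeping the match IDs in a per-terminal set (an inner map in the real code) is what lets several match IDs share one keyword and later be removed independently.
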
From 34550ef66218a7e6c39e3349ea51930b09f103d0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 16:07:38 +0200 Subject: [PATCH 08/12] Update fix for match ID vocab --- spacy/matcher/phrasematcher.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 940ebb60e..05231ffae 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -177,7 +177,7 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#add """ - _ = self.vocab.strings[key] + _ = self.vocab[key] self._callbacks[key] = on_match self._keywords.setdefault(key, []) self._docs.setdefault(key, set()) From d995a7849e4c15f9cbc470410b543609f54b3a68 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 24 Sep 2019 16:20:24 +0200 Subject: [PATCH 09/12] Switch from map_get_unless_missing to map_get --- spacy/matcher/phrasematcher.pyx | 73 +++++++++++++++++---------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 05231ffae..d67cda005 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -8,8 +8,8 @@ from libcpp.vector cimport vector from cymem.cymem cimport Pool -from preshed.maps cimport MapStruct, map_init, map_set, map_get_unless_missing -from preshed.maps cimport map_clear, map_iter, key_t, Result +from preshed.maps cimport MapStruct, map_init, map_set, map_get, map_clear +from preshed.maps cimport map_iter, key_t import numpy as np @@ -112,7 +112,7 @@ cdef class PhraseMatcher: cdef MapStruct* current_node cdef MapStruct* terminal_map cdef MapStruct* node_pointer - cdef Result result + cdef void* result cdef key_t terminal_key cdef void* value cdef int c_i = 0 @@ -120,20 +120,20 @@ cdef class PhraseMatcher: current_node = self.c_map token_trie_list = [] for token in keyword: - result = map_get_unless_missing(current_node, token) - if result.found: + result = map_get(current_node, token) + if result: token_trie_list.append((token, current_node)) - current_node = result.value + current_node = result else: # if token is not found, break out of the loop current_node = NULL break # remove the tokens from trie node if there are no other # keywords with them - result = map_get_unless_missing(current_node, self._terminal_node) - if current_node != NULL and result.found: + result = map_get(current_node, self._terminal_node) + if current_node != NULL and result: # if this is the only remaining key, remove unnecessary paths - terminal_map = result.value + terminal_map = result terminal_keys = [] c_i = 0 while map_iter(terminal_map, &c_i, &terminal_key, &value): @@ -145,22 +145,22 @@ cdef class PhraseMatcher: token_trie_list.reverse() for key_to_remove, py_node_pointer in token_trie_list: node_pointer = py_node_pointer - result = map_get_unless_missing(node_pointer, key_to_remove) + result = map_get(node_pointer, key_to_remove) if node_pointer.filled == 1: map_clear(node_pointer, key_to_remove) - self.mem.free(result.value) + self.mem.free(result) pass else: # more than one key means more than 1 path, # delete not required path and keep the other map_clear(node_pointer, key_to_remove) - self.mem.free(result.value) + self.mem.free(result) break # otherwise simply remove the key else: - result = map_get_unless_missing(current_node, self._terminal_node) - if result.found: - map_clear(result.value, self.vocab.strings[key]) + result = map_get(current_node, self._terminal_node) + if 
result: + map_clear(result, self.vocab.strings[key]) del self._keywords[key] del self._callbacks[key] @@ -185,7 +185,7 @@ cdef class PhraseMatcher: cdef MapStruct* current_node cdef MapStruct* internal_node - cdef Result result + cdef void* result for doc in docs: if len(doc) == 0: @@ -205,20 +205,20 @@ cdef class PhraseMatcher: current_node = self.c_map for token in keyword: - result = map_get_unless_missing(current_node, token) - if not result.found: + result = map_get(current_node, token) + if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) map_init(self.mem, internal_node, 8) map_set(self.mem, current_node, token, internal_node) - result.value = internal_node - current_node = result.value - result = map_get_unless_missing(current_node, self._terminal_node) - if not result.found: + result = internal_node + current_node = result + result = map_get(current_node, self._terminal_node) + if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) map_init(self.mem, internal_node, 8) map_set(self.mem, current_node, self._terminal_node, internal_node) - result.value = internal_node - map_set(self.mem, result.value, self.vocab.strings[key], NULL) + result = internal_node + map_set(self.mem, result, self.vocab.strings[key], NULL) def __call__(self, doc): """Find all sequences matching the supplied patterns on the `Doc`. @@ -258,34 +258,35 @@ cdef class PhraseMatcher: cdef void* value cdef int i = 0 cdef MatchStruct ms + cdef void* result while idx < hash_array_len: start = idx token = hash_array[idx] # look for sequences from this position - result = map_get_unless_missing(current_node, token) - if result.found: - current_node = result.value + result = map_get(current_node, token) + if result: + current_node = result idy = idx + 1 while idy < hash_array_len: - result = map_get_unless_missing(current_node, self._terminal_node) - if result.found: + result = map_get(current_node, self._terminal_node) + if result: i = 0 - while map_iter(result.value, &i, &key, &value): + while map_iter(result, &i, &key, &value): ms = make_matchstruct(key, start, idy) matches.push_back(ms) inner_token = hash_array[idy] - result = map_get_unless_missing(current_node, inner_token) - if result.found: - current_node = result.value + result = map_get(current_node, inner_token) + if result: + current_node = result idy += 1 else: break else: # end of hash_array reached - result = map_get_unless_missing(current_node, self._terminal_node) - if result.found: + result = map_get(current_node, self._terminal_node) + if result: i = 0 - while map_iter(result.value, &i, &key, &value): + while map_iter(result, &i, &key, &value): ms = make_matchstruct(key, start, idy) matches.push_back(ms) current_node = self.c_map From 3c6f1d7e3afb4b702b54109d400c6fc6bfa78f24 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 25 Sep 2019 09:41:27 +0200 Subject: [PATCH 10/12] Switch from numpy array to Token.get_struct_attr Access token attributes directly in Doc instead of making a copy of the relevant values in a numpy array. Add unsatisfactory warning for hash collision with reserved terminal hash key. (Ideally it would change the reserved terminal hash and redo the whole trie, but for now, I'm hoping there won't be collisions.) 
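
In outline, the matching walk now reads each token's attribute hash on the fly instead of from a precomputed copy. A rough pure-Python equivalent of the nogil find_matches() loop over a dict-style trie (TERMINAL and get_attr are illustrative stand-ins; match ends are exclusive token offsets, as in the real matcher):

    # Rough pure-Python equivalent of find_matches(): attribute hashes
    # are read per token rather than copied into an array up front.
    def find_matches(trie, doc, get_attr, TERMINAL):
        matches = []
        for idx in range(len(doc)):
            node = trie.get(get_attr(doc[idx]))
            idy = idx + 1
            while node is not None:
                # emit a match for every ID stored on this node
                for match_id in node.get(TERMINAL, ()):
                    matches.append((match_id, idx, idy))
                if idy >= len(doc):
                    break
                node = node.get(get_attr(doc[idy]))
                idy += 1
        return matches

    trie = {"a": {"__end__": {"ID1"}, "b": {"__end__": {"ID2"}}}}
    find_matches(trie, ["a", "b"], lambda t: t, "__end__")
    # -> [("ID1", 0, 1), ("ID2", 0, 2)]
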
--- spacy/errors.py | 2 + spacy/matcher/phrasematcher.pyx | 69 ++++++++++++--------------------- 2 files changed, 26 insertions(+), 45 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b03bd6d23..da6694bb7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -86,6 +86,8 @@ class Warnings(object): "previously loaded vectors. See Issue #3853.") W020 = ("Unnamed vectors. This won't allow multiple vectors models to be " "loaded. (Shape: {shape})") + W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " + "incorrect. Modify PhraseMatcher._terminal_hash to fix.") @add_codes diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index d67cda005..aedb16dcc 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -11,10 +11,10 @@ from cymem.cymem cimport Pool from preshed.maps cimport MapStruct, map_init, map_set, map_get, map_clear from preshed.maps cimport map_iter, key_t -import numpy as np - from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t from ..vocab cimport Vocab +from ..structs cimport TokenC +from ..tokens.token cimport Token from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA @@ -42,9 +42,9 @@ cdef class PhraseMatcher: cdef MapStruct* c_map cdef Pool mem - cdef key_t _terminal_node + cdef key_t _terminal_hash - cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil + cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. @@ -66,7 +66,7 @@ cdef class PhraseMatcher: self.mem = Pool() self.c_map = self.mem.alloc(1, sizeof(MapStruct)) - self._terminal_node = 1 # or random: np.random.randint(0, high=np.iinfo(np.uint64).max, dtype=np.uint64) + self._terminal_hash = 826361138722620965 map_init(self.mem, self.c_map, 8) if isinstance(attr, long): @@ -130,7 +130,7 @@ cdef class PhraseMatcher: break # remove the tokens from trie node if there are no other # keywords with them - result = map_get(current_node, self._terminal_node) + result = map_get(current_node, self._terminal_hash) if current_node != NULL and result: # if this is the only remaining key, remove unnecessary paths terminal_map = result @@ -158,7 +158,7 @@ cdef class PhraseMatcher: break # otherwise simply remove the key else: - result = map_get(current_node, self._terminal_node) + result = map_get(current_node, self._terminal_hash) if result: map_clear(result, self.vocab.strings[key]) @@ -205,6 +205,9 @@ cdef class PhraseMatcher: current_node = self.c_map for token in keyword: + if token == self._terminal_hash: + user_warning(Warnings.W021) + break result = map_get(current_node, token) if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) @@ -212,11 +215,11 @@ cdef class PhraseMatcher: map_set(self.mem, current_node, token, internal_node) result = internal_node current_node = result - result = map_get(current_node, self._terminal_node) + result = map_get(current_node, self._terminal_hash) if not result: internal_node = self.mem.alloc(1, sizeof(MapStruct)) map_init(self.mem, internal_node, 8) - map_set(self.mem, current_node, self._terminal_node, internal_node) + map_set(self.mem, current_node, self._terminal_hash, internal_node) result = internal_node map_set(self.mem, result, self.vocab.strings[key], NULL) @@ -230,17 +233,13 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#call """ - doc_array = 
self._convert_to_array(doc) matches = [] - if doc_array is None or len(doc_array) == 0: - # if doc_array is empty or None just return empty list + if doc is None or len(doc) == 0: + # if doc is empty or None just return empty list return matches - if not doc_array.flags['C_CONTIGUOUS']: - doc_array = np.ascontiguousarray(doc_array) - cdef key_t[::1] doc_array_memview = doc_array cdef vector[MatchStruct] c_matches - self.find_matches(&doc_array_memview[0], doc_array_memview.shape[0], &c_matches) + self.find_matches(doc, &c_matches) for i in range(c_matches.size()): matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end)) for i, (ent_id, start, end) in enumerate(matches): @@ -249,7 +248,7 @@ cdef class PhraseMatcher: on_match(self, doc, i, matches) return matches - cdef void find_matches(self, key_t* hash_array, int hash_array_len, vector[MatchStruct] *matches) nogil: + cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil: cdef MapStruct* current_node = self.c_map cdef int start = 0 cdef int idx = 0 @@ -259,22 +258,22 @@ cdef class PhraseMatcher: cdef int i = 0 cdef MatchStruct ms cdef void* result - while idx < hash_array_len: + while idx < doc.length: start = idx - token = hash_array[idx] + token = Token.get_struct_attr(&doc.c[idx], self.attr) # look for sequences from this position result = map_get(current_node, token) if result: current_node = result idy = idx + 1 - while idy < hash_array_len: - result = map_get(current_node, self._terminal_node) + while idy < doc.length: + result = map_get(current_node, self._terminal_hash) if result: i = 0 while map_iter(result, &i, &key, &value): ms = make_matchstruct(key, start, idy) matches.push_back(ms) - inner_token = hash_array[idy] + inner_token = Token.get_struct_attr(&doc.c[idy], self.attr) result = map_get(current_node, inner_token) if result: current_node = result @@ -282,8 +281,8 @@ cdef class PhraseMatcher: else: break else: - # end of hash_array reached - result = map_get(current_node, self._terminal_node) + # end of doc reached + result = map_get(current_node, self._terminal_hash) if result: i = 0 while map_iter(result, &i, &key, &value): @@ -325,28 +324,8 @@ cdef class PhraseMatcher: else: yield doc - def get_lex_value(self, Doc doc, int i): - if self.attr == ORTH: - # Return the regular orth value of the lexeme - return doc.c[i].lex.orth - # Get the attribute value instead, e.g. token.pos - attr_value = get_token_attr(&doc.c[i], self.attr) - if attr_value in (0, 1): - # Value is boolean, convert to string - string_attr_value = str(attr_value) - else: - string_attr_value = self.vocab.strings[attr_value] - string_attr_name = self.vocab.strings[self.attr] - # Concatenate the attr name and value to not pollute lexeme space - # e.g. 
'POS-VERB' instead of just 'VERB', which could otherwise - # create false positive matches - matcher_attr_string = "matcher:{}-{}".format(string_attr_name, string_attr_value) - # Add new string to vocab - _ = self.vocab[matcher_attr_string] - return self.vocab.strings[matcher_attr_string] - def _convert_to_array(self, Doc doc): - return np.array([self.get_lex_value(doc, i) for i in range(len(doc))], dtype=np.uint64) + return [Token.get_struct_attr(&doc.c[i], self.attr) for i in range(len(doc))] def unpickle_matcher(vocab, docs, callbacks): From 7862a6eb016ed10a24d9ea237816c5ae9157cf01 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 25 Sep 2019 11:03:58 +0200 Subject: [PATCH 11/12] Restructure imports to export find_matches --- spacy/matcher/phrasematcher.pxd | 24 +++++++++++++++++++++++- spacy/matcher/phrasematcher.pyx | 23 ++--------------------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index 1a550989d..8695a8114 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -1,4 +1,26 @@ -from preshed.maps cimport key_t +from libcpp.vector cimport vector + +from cymem.cymem cimport Pool +from preshed.maps cimport key_t, MapStruct + +from ..attrs cimport attr_id_t +from ..tokens.doc cimport Doc +from ..vocab cimport Vocab + + +cdef class PhraseMatcher: + cdef Vocab vocab + cdef attr_id_t attr + cdef object _callbacks + cdef object _keywords + cdef object _docs + cdef bint _validate + cdef MapStruct* c_map + cdef Pool mem + cdef key_t _terminal_hash + + cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil + cdef struct MatchStruct: key_t match_id diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index aedb16dcc..75881848a 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -3,19 +3,12 @@ from __future__ import unicode_literals from libc.stdint cimport uintptr_t -from libc.stdio cimport printf -from libcpp.vector cimport vector -from cymem.cymem cimport Pool +from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter -from preshed.maps cimport MapStruct, map_init, map_set, map_get, map_clear -from preshed.maps cimport map_iter, key_t - -from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, attr_id_t -from ..vocab cimport Vocab +from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..structs cimport TokenC from ..tokens.token cimport Token -from ..tokens.doc cimport Doc, get_token_attr from ._schemas import TOKEN_PATTERN_SCHEMA from ..errors import Errors, Warnings, deprecation_warning, user_warning @@ -33,18 +26,6 @@ cdef class PhraseMatcher: MIT License (see `LICENSE`) Copyright (c) 2017 Vikash Singh (vikash.duliajan@gmail.com) """ - cdef Vocab vocab - cdef attr_id_t attr - cdef object _callbacks - cdef object _keywords - cdef object _docs - cdef bint _validate - - cdef MapStruct* c_map - cdef Pool mem - cdef key_t _terminal_hash - - cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): """Initialize the PhraseMatcher. From 3fdb22d832c01faf179f8e597de10287d4d72fd9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 26 Sep 2019 11:31:03 +0200 Subject: [PATCH 12/12] Implement full remove() Remove unnecessary trie paths and free unused maps. Parallel to Matcher, raise KeyError when attempting to remove a match ID that has not been added. 
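
On the dict analogue, the pruning amounts to walking the keyword's path and, if the removed key was the sole owner of the terminal node, deleting nodes bottom-up until one is still shared with another keyword. A sketch with illustrative names (the real version walks preshed maps and frees them through the pool):

    # Dict-based sketch of the pruning in remove(): a keyword's nodes are
    # deleted bottom-up only while no other keyword still needs them.
    def remove_keyword(trie, token_hashes, match_id, TERMINAL):
        path, node = [], trie
        for token in token_hashes:
            if token not in node:
                return              # keyword not in trie; nothing to prune
            path.append((node, token))
            node = node[token]
        terminal = node.get(TERMINAL)
        if terminal == {match_id}:
            del node[TERMINAL]      # sole owner: drop terminal, prune upward
            for parent, key in reversed(path):
                if parent[key]:     # node still shared with another keyword
                    break
                del parent[key]
        elif terminal is not None:
            terminal.discard(match_id)

The KeyError for unknown match IDs is raised up front in remove() itself, before any per-keyword pruning.
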
--- spacy/matcher/phrasematcher.pyx | 30 ++++++++++++---------- spacy/tests/matcher/test_phrase_matcher.py | 3 ++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 75881848a..68428d843 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -84,12 +84,13 @@ cdef class PhraseMatcher: return (unpickle_matcher, data, None, None) def remove(self, key): - """Remove a match-rule from the matcher by match ID. + """Remove a rule from the matcher by match ID. A KeyError is raised if + the key does not exist. key (unicode): The match ID. """ if key not in self._keywords: - return + raise KeyError(key) cdef MapStruct* current_node cdef MapStruct* terminal_map cdef MapStruct* node_pointer @@ -97,13 +98,16 @@ cdef class PhraseMatcher: cdef key_t terminal_key cdef void* value cdef int c_i = 0 + cdef vector[MapStruct*] path_nodes + cdef vector[key_t] path_keys + cdef key_t key_to_remove for keyword in self._keywords[key]: current_node = self.c_map - token_trie_list = [] for token in keyword: result = map_get(current_node, token) if result: - token_trie_list.append((token, current_node)) + path_nodes.push_back(current_node) + path_keys.push_back(token) current_node = result else: # if token is not found, break out of the loop @@ -113,27 +117,25 @@ cdef class PhraseMatcher: # keywords with them result = map_get(current_node, self._terminal_hash) if current_node != NULL and result: - # if this is the only remaining key, remove unnecessary paths terminal_map = result terminal_keys = [] c_i = 0 while map_iter(terminal_map, &c_i, &terminal_key, &value): terminal_keys.append(self.vocab.strings[terminal_key]) - # TODO: not working, fix remove for unused paths/maps - if False and terminal_keys == [key]: - # we found a complete match for input keyword - token_trie_list.append((self.vocab.strings[key], terminal_map)) - token_trie_list.reverse() - for key_to_remove, py_node_pointer in token_trie_list: - node_pointer = py_node_pointer + # if this is the only remaining key, remove unnecessary paths + if terminal_keys == [key]: + while not path_nodes.empty(): + node_pointer = path_nodes.back() + path_nodes.pop_back() + key_to_remove = path_keys.back() + path_keys.pop_back() result = map_get(node_pointer, key_to_remove) if node_pointer.filled == 1: map_clear(node_pointer, key_to_remove) self.mem.free(result) - pass else: # more than one key means more than 1 path, - # delete not required path and keep the other + # delete not required path and keep the others map_clear(node_pointer, key_to_remove) self.mem.free(result) break diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7d65d0007..486cbb984 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -84,7 +84,8 @@ def test_phrase_matcher_remove(en_vocab): assert "TEST2" not in matcher assert "TEST3" not in matcher assert len(matcher(doc)) == 0 - matcher.remove("TEST3") + with pytest.raises(KeyError): + matcher.remove("TEST3") assert "TEST1" not in matcher assert "TEST2" not in matcher assert "TEST3" not in matcher
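
With the series applied, the behaviour pinned down by the tests can be summarised in a short usage sketch (assuming an `en_vocab` Vocab such as the pytest fixture provides):

    from spacy.matcher import PhraseMatcher
    from spacy.tokens import Doc

    matcher = PhraseMatcher(en_vocab)   # en_vocab: a Vocab, e.g. the test fixture
    matcher.add("COMPANY", None, Doc(en_vocab, words=["Google", "Now"]))
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    assert len(matcher(doc)) == 1       # one (match_id, start, end) triple
    matcher.remove("COMPANY")           # prunes the trie; KeyError if ID unknown
    assert len(matcher(doc)) == 0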