From 669bc1a3143fe56ee351da3df4e867fe4c592853 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 26 Sep 2019 21:00:46 +0200
Subject: [PATCH] Switch to local cdef functions for span filtering

---
 spacy/tokenizer.pxd |  2 ++
 spacy/tokenizer.pyx | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 506076ac1..ec6640196 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -45,3 +45,5 @@ cdef class Tokenizer:
                                 bint with_special_cases) except -1
     cdef int _save_cached(self, const TokenC* tokens, hash_t key,
                           int* has_special, int n) except -1
+    cdef void _filter_spans(self, vector[MatchStruct] &original,
+                            vector[MatchStruct] &filtered, int doc_len) nogil
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 8433f2ca5..de3e2fd95 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -6,6 +6,9 @@ from __future__ import unicode_literals
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from libc.string cimport memcpy, memset
+from libcpp.algorithm cimport sort
+from libcpp.set cimport set as stdset
+from libc.stdio cimport printf
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 cimport cython
@@ -246,8 +249,9 @@ cdef class Tokenizer:
         # Skip processing if no matches
         if c_matches.size() == 0:
             return True
-        spans = [doc[match.start:match.end] for match in c_matches]
-        spans = util.filter_spans(spans)
+        cdef vector[MatchStruct] c_filtered
+        self._filter_spans(c_matches, c_filtered, doc.length)
+        spans = [doc[match.start:match.end] for match in c_filtered]
         # Put span info in span.start-indexed dict and calculate maximum
         # intermediate document size
         span_data = {}
@@ -308,6 +312,22 @@ cdef class Tokenizer:
             doc.length = doc.length + offset
         return True
 
+    cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:
+
+        cdef int seen_i
+        cdef MatchStruct span
+        cdef stdset[int] seen_tokens
+        sort(original.begin(), original.end(), len_start_cmp)
+        cdef int orig_i = original.size() - 1
+        while orig_i >= 0:
+            span = original[orig_i]
+            if not seen_tokens.count(span.start) and not seen_tokens.count(span.end - 1):
+                filtered.push_back(span)
+                for seen_i in range(span.start, span.end):
+                    seen_tokens.insert(seen_i)
+            orig_i -= 1
+        sort(filtered.begin(), filtered.end(), start_cmp)
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
@@ -660,3 +680,13 @@ cdef class Tokenizer:
 def _get_regex_pattern(regex):
     """Get a pattern string for a regex, or None if the pattern is None."""
     return None if regex is None else regex.__self__.pattern
+
+
+cdef bint len_start_cmp(MatchStruct a, MatchStruct b) nogil:
+    if a.end - a.start == b.end - b.start:
+        return a.start < b.start
+    return a.end - a.start < b.end - b.start
+
+
+cdef bint start_cmp(MatchStruct a, MatchStruct b) nogil:
+    return a.start < b.start
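
Note (not part of the patch): below is a rough Python sketch of the greedy
filtering that the new _filter_spans cdef performs on the C++ side, for
reference when reviewing. Plain (start, end) tuples stand in for MatchStruct,
and the function name filter_matches is illustrative only:

    def filter_matches(matches):
        # matches: list of (start, end) token offsets, end exclusive.
        # Sort ascending by (length, start), mirroring len_start_cmp.
        ordered = sorted(matches, key=lambda m: (m[1] - m[0], m[0]))
        seen_tokens = set()
        filtered = []
        # Walk from the back (longest spans first) and keep a span only if
        # neither its first nor its last token is already covered.
        for start, end in reversed(ordered):
            if start not in seen_tokens and (end - 1) not in seen_tokens:
                filtered.append((start, end))
                seen_tokens.update(range(start, end))
        # Return the kept spans in document order, mirroring start_cmp.
        return sorted(filtered)

    # e.g. filter_matches([(0, 2), (1, 3), (0, 3)]) -> [(0, 3)]

Doing this over the C++ match vectors means the overlapping matches are
filtered before any Python Span objects are built, and only the kept matches
are sliced out of the Doc afterwards.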