Switch to local cdef functions for span filtering

2025-08-09 14:44:52 +03:00 · 2019-09-26 21:00:46 +02:00 · 2019-09-26 21:00:46 +02:00 · 669bc1a314
commit 669bc1a314
parent ae348bee43
2 changed files with 34 additions and 2 deletions
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -45,3 +45,5 @@ cdef class Tokenizer:
                            bint with_special_cases) except -1
    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
                          int* has_special, int n) except -1
+    # Picks a non-overlapping subset of the matches in `original` and appends
+    # it to `filtered`. Declared nogil, so it can be called without the GIL.
+    cdef void _filter_spans(self, vector[MatchStruct] &original,
+                            vector[MatchStruct] &filtered, int doc_len) nogil
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -6,6 +6,9 @@ from __future__ import unicode_literals
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from libc.string cimport memcpy, memset
+from libcpp.algorithm cimport sort
+from libcpp.set cimport set as stdset
+from libc.stdio cimport printf
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 cimport cython
@@ -246,8 +249,9 @@ cdef class Tokenizer:
        # Skip processing if no matches
        if c_matches.size() == 0:
            return True
-        spans = [doc[match.start:match.end] for match in c_matches]
-        spans = util.filter_spans(spans)
+        cdef vector[MatchStruct] c_filtered
+        self._filter_spans(c_matches, c_filtered, doc.length)
+        spans = [doc[match.start:match.end] for match in c_filtered]
        # Put span info in span.start-indexed dict and calculate maximum
        # intermediate document size
        span_data = {}
@@ -308,6 +312,22 @@ cdef class Tokenizer:
        doc.length = doc.length + offset
        return True

+    cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:
+        # Select a non-overlapping subset of the candidate matches, giving
+        # priority to longer matches.
+        #
+        # original: candidate matches; sorted IN PLACE with len_start_cmp.
+        # filtered: output vector; accepted matches are appended to it and
+        #           the whole vector is then sorted with start_cmp.
+        # doc_len:  not used by this method.
+        cdef int seen_i
+        cdef MatchStruct span
+        cdef stdset[int] seen_tokens
+        sort(original.begin(), original.end(), len_start_cmp)
+        # The sort is ascending, so walk from the back to visit the
+        # highest-ranked (longest) match first.
+        cdef int orig_i = original.size() - 1
+        while orig_i >= 0:
+            span = original[orig_i]
+            # `end` is exclusive, so the last token covered is `end - 1`.
+            # Checking both boundaries is sufficient: every accepted match is
+            # at least as long as this one, so this one cannot strictly
+            # contain an accepted match.
+            if not seen_tokens.count(span.start) and not seen_tokens.count(span.end - 1):
+                filtered.push_back(span)
+                # Only accepted matches claim their tokens. Marking the tokens
+                # of rejected matches as well would wrongly discard later
+                # matches that overlap nothing that was actually kept.
+                for seen_i in range(span.start, span.end):
+                    seen_tokens.insert(seen_i)
+            orig_i -= 1
+        sort(filtered.begin(), filtered.end(), start_cmp)
+
    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
        cached = <_Cached*>self._cache.get(key)
        if cached == NULL:
@@ -660,3 +680,13 @@ cdef class Tokenizer:
 def _get_regex_pattern(regex):
    """Return the pattern string behind `regex`, or None when `regex` is None.

    `regex` must expose `__self__.pattern` (e.g. a bound method of a
    compiled pattern object).
    """
    if regex is None:
        return None
    return regex.__self__.pattern
+
+
+cdef bint len_start_cmp(MatchStruct a, MatchStruct b) nogil:
+    # "Less than" predicate for sorting matches by ascending length
+    # (end - start).
+    # Equal-length matches are ordered by DESCENDING start: _filter_spans
+    # consumes the sorted vector from the back, so this makes the earliest of
+    # several equally long matches the first one considered.
+    if a.end - a.start == b.end - b.start:
+        return b.start < a.start
+    return a.end - a.start < b.end - b.start
+
+
+cdef bint start_cmp(MatchStruct a, MatchStruct b) nogil:
+    # "Less than" predicate ordering matches by ascending start index.
+    return b.start > a.start