From 0e7f94b247e0e616439339c66588871a4be30750 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 19 Apr 2021 11:08:20 +0200
Subject: [PATCH] Update Tokenizer.explain with special matches (#7749)

* Update Tokenizer.explain with special matches

Update `Tokenizer.explain` and the pseudo-code in the docs to include
the processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

Add test for `Tokenizer.explain` for special cases containing affixes.
---
 spacy/tests/tokenizer/test_explain.py     | 17 ++++++++++
 spacy/tokenizer.pyx                       | 39 +++++++++++++++++++++--
 website/docs/usage/linguistic-features.md |  7 +++-
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index ea6cf91be..0a10ae67d 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -1,5 +1,7 @@
 import pytest
+import re
 from spacy.util import get_lang_class
+from spacy.tokenizer import Tokenizer
 
 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
     tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens
+
+
+def test_tokenizer_explain_special_matcher(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    infix_re = re.compile(r"[/]")
+    rules = {"a.": [{"ORTH": "a."}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+        suffix_search=suffix_re.search,
+        infix_finditer=infix_re.finditer,
+    )
+    tokens = [t.text for t in tokenizer("a/a.")]
+    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
+    assert tokens == explain_tokens
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 5bd6e7aa3..41bbaeee6 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -20,11 +20,12 @@ from .attrs import intify_attrs
 from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
-from .util import registry
+from .util import registry, get_words_and_spaces
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
 from .training import validate_examples
+from .tokens import Span
 
 
 cdef class Tokenizer:
@@ -638,8 +639,14 @@ cdef class Tokenizer:
         DOCS: https://spacy.io/api/tokenizer#explain
         """
         prefix_search = self.prefix_search
+        if prefix_search is None:
+            prefix_search = re.compile("a^").search
         suffix_search = self.suffix_search
+        if suffix_search is None:
+            suffix_search = re.compile("a^").search
         infix_finditer = self.infix_finditer
+        if infix_finditer is None:
+            infix_finditer = re.compile("a^").finditer
         token_match = self.token_match
         if token_match is None:
             token_match = re.compile("a^").match
@@ -687,7 +694,7 @@ cdef class Tokenizer:
                     tokens.append(("URL_MATCH", substring))
                     substring = ''
                 elif substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                     substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@ cdef class Tokenizer:
                     tokens.append(("TOKEN", substring))
                     substring = ''
             tokens.extend(reversed(suffixes))
-        return tokens
+        # Find matches for special cases handled by special matcher
+        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
+        t_words = []
+        t_spaces = []
+        for word, space in zip(words, spaces):
+            if not word.isspace():
+                t_words.append(word)
+                t_spaces.append(space)
+        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
+        matches = self._special_matcher(doc)
+        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
+        spans = util.filter_spans(spans)
+        # Replace matched tokens with their exceptions
+        i = 0
+        final_tokens = []
+        spans_by_start = {s.start: s for s in spans}
+        while i < len(tokens):
+            if i in spans_by_start:
+                span = spans_by_start[i]
+                exc = [d[ORTH] for d in special_cases[span.label_]]
+                for j, orth in enumerate(exc):
+                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                i += len(span)
+            else:
+                final_tokens.append(tokens[i])
+                i += 1
+        return final_tokens
 
     def score(self, examples, **kwargs):
         validate_examples(examples, "Tokenizer.score")
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 2d3390049..077b1a556 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -786,6 +786,7 @@ rather than performance:
 
 ```python
 def tokenizer_pseudo_code(
+    text,
     special_cases,
     prefix_search,
     suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
                 tokens.append(substring)
                 substring = ""
         tokens.extend(reversed(suffixes))
+    for match in matcher(special_cases, text):
+        tokens.replace(match, special_cases[match])
     return tokens
 ```
 
 The algorithm can be summarized as follows:
 
-1. Iterate over whitespace-separated substrings.
+1. Iterate over space-separated substrings.
 2. Look for a token match. If there is a match, stop processing and keep this
    token.
 3. Check whether we have an explicitly defined special case for this substring.
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
 8. Look for "infixes" – stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 9. Once we can't consume any more of the string, handle it as a single token.
+10. Make a final pass over the text to check for special cases that include
+    spaces or that were missed due to the incremental processing of affixes.
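
A minimal usage sketch (not part of the patch) of the behavior this change targets, mirroring the new test above: a custom special case ("a.") whose surface form overlaps with the suffix pattern. The `English` vocab and the outputs shown in the comments are illustrative assumptions; the point is that `Tokenizer.explain` now runs the same final special-cases pass as the tokenizer itself, so the two agree.

```python
# Sketch based on the test added in this patch: a special case ("a.") that
# would otherwise be split by the suffix rule is reported by
# Tokenizer.explain the same way the tokenizer actually emits it.
import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
suffix_re = re.compile(r"[\.]$")  # treat a trailing "." as a suffix
infix_re = re.compile(r"[/]")     # split on "/" as an infix
rules = {"a.": [{"ORTH": "a."}]}  # special case keeping "a." as one token

tokenizer = Tokenizer(
    nlp.vocab,
    rules=rules,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)

text = "a/a."
tokens = [t.text for t in tokenizer(text)]
explained = tokenizer.explain(text)  # list of (pattern, substring) pairs

print(tokens)                     # expected: ['a', '/', 'a.']
print([t[1] for t in explained])  # now matches the tokenizer output above
```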