From 2c876eb672f8ea03b7e639d4c75491a92327bd06 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 20 Nov 2019 13:07:25 +0100 Subject: [PATCH] Add tokenizer explain() debugging method (#4596) * Expose tokenizer rules as a property Expose the tokenizer rules property in the same way as the other core properties. (The cache resetting is overkill, but consistent with `from_bytes` for now.) Add tests and update Tokenizer API docs. * Update Hungarian punctuation to remove empty string Update Hungarian punctuation definitions so that `_units` does not match an empty string. * Use _load_special_tokenization consistently Use `_load_special_tokenization()` and have it handle `None` checks. * Fix precedence of `token_match` vs. special cases Remove `token_match` check from `_split_affixes()` so that special cases have precedence over `token_match`. `token_match` is checked only before infixes are split. * Add `make_debug_doc()` to the Tokenizer Add `make_debug_doc()` to the Tokenizer as a working implementation of the pseudo-code in the docs. Add a test (marked as slow) that checks that `nlp.tokenizer()` and `nlp.tokenizer.make_debug_doc()` return the same non-whitespace tokens for all languages that have `examples.sentences` that can be imported. * Update tokenization usage docs Update pseudo-code and algorithm description to correspond to `nlp.tokenizer.make_debug_doc()` with example debugging usage. Add more examples for customizing tokenizers while preserving the existing defaults. Minor edits / clarifications. * Revert "Update Hungarian punctuation to remove empty string" This reverts commit f0a577f7a5c67f55807fdbda9e9a936464723931. * Rework `make_debug_doc()` as `explain()` Rework `make_debug_doc()` as `explain()`, which returns a list of `(pattern_string, token_string)` tuples rather than a non-standard `Doc`. Update docs and tests accordingly, leaving the visualization for future work. * Handle cases with bad tokenizer patterns Detect when tokenizer patterns match empty prefixes and suffixes so that `explain()` does not hang on bad patterns.
* Remove unused displacy image * Add tokenizer.explain() to usage docs --- .../lang/en/test_customized_tokenizer.py | 82 +++++++++++++++- spacy/tests/tokenizer/test_explain.py | 30 ++++++ spacy/tokenizer.pyx | 95 ++++++++++++++++--- website/docs/api/tokenizer.md | 51 +++++++--- website/docs/usage/linguistic-features.md | 27 ++++++ 5 files changed, 258 insertions(+), 27 deletions(-) create mode 100644 spacy/tests/tokenizer/test_explain.py diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py index fdac32a90..7f939011f 100644 --- a/spacy/tests/lang/en/test_customized_tokenizer.py +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import re from spacy.lang.en import English from spacy.tokenizer import Tokenizer from spacy.util import compile_prefix_regex, compile_suffix_regex @@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab): r"[\[\]!&:,()\*—–\/-]", ] infix_re = compile_infix_regex(custom_infixes) + token_match_re = re.compile("a-b") return Tokenizer( en_vocab, English.Defaults.tokenizer_exceptions, prefix_re.search, suffix_re.search, infix_re.finditer, - token_match=None, + token_match=token_match_re.match, ) @@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer): "Megaregion", ".", ] + + +def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion." + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "a-b", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ] + + +def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)" + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":)", + ] + + +def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. 
:)" + rules = custom_en_tokenizer.rules + del rules[":)"] + custom_en_tokenizer.rules = rules + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":", + ")", + ] diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py new file mode 100644 index 000000000..dcd4268ab --- /dev/null +++ b/spacy/tests/tokenizer/test_explain.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import importlib +import pytest +from spacy.util import get_lang_class + + +# fmt: off +# Only include languages with no external dependencies +# excluded: ja, ru, th, uk, vi, zh +LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", + "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"] +# fmt: on + + +@pytest.mark.slow +@pytest.mark.parametrize("lang", LANGUAGES) +def test_tokenizer_explain(lang): + nlp = get_lang_class(lang)() + try: + examples = importlib.import_module("spacy.lang." + lang + ".examples") + for sentence in examples.sentences: + tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space] + debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)] + assert tokens == debug_tokens + except: + pass diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b39bb1ecb..f95b44283 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -15,6 +15,8 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string from .compat import unescape_unicode +from .attrs import intify_attrs +from .symbols import ORTH from .errors import Errors, Warnings, deprecation_warning from . import util @@ -57,9 +59,7 @@ cdef class Tokenizer: self.infix_finditer = infix_finditer self.vocab = vocab self._rules = {} - if rules is not None: - for chunk, substrings in sorted(rules.items()): - self.add_special_case(chunk, substrings) + self._load_special_tokenization(rules) property token_match: def __get__(self): @@ -93,6 +93,18 @@ cdef class Tokenizer: self._infix_finditer = infix_finditer self._flush_cache() + property rules: + def __get__(self): + return self._rules + + def __set__(self, rules): + self._rules = {} + self._reset_cache([key for key in self._cache]) + self._reset_specials() + self._cache = PreshMap() + self._specials = PreshMap() + self._load_special_tokenization(rules) + def __reduce__(self): args = (self.vocab, self._rules, @@ -227,10 +239,6 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: - if self.token_match and self.token_match(string) \ - and not self.find_prefix(string) \ - and not self.find_suffix(string): - break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -393,8 +401,9 @@ cdef class Tokenizer: def _load_special_tokenization(self, special_cases): """Add special-case tokenization rules.""" - for chunk, substrings in sorted(special_cases.items()): - self.add_special_case(chunk, substrings) + if special_cases is not None: + for chunk, substrings in sorted(special_cases.items()): + self.add_special_case(chunk, substrings) def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. 
@@ -423,6 +432,71 @@ cdef class Tokenizer: self.mem.free(stale_cached) self._rules[string] = substrings + def explain(self, text): + """A debugging tokenizer that provides information about which + tokenizer rule or pattern was matched for each token. The tokens + produced are identical to `nlp.tokenizer()` except for whitespace + tokens. + + text (unicode): The string to tokenize. + RETURNS (list): A list of (pattern_string, token_string) tuples. + + DOCS: https://spacy.io/api/tokenizer#explain + """ + prefix_search = self.prefix_search + suffix_search = self.suffix_search + infix_finditer = self.infix_finditer + token_match = self.token_match + special_cases = {} + for orth, special_tokens in self.rules.items(): + special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] + tokens = [] + for substring in text.split(): + suffixes = [] + while substring: + while prefix_search(substring) or suffix_search(substring): + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + break + if prefix_search(substring): + split = prefix_search(substring).end() + tokens.append(("PREFIX", substring[:split])) + substring = substring[split:] + if substring in special_cases: + continue + # break if pattern matches the empty string + if split == 0: + break + if suffix_search(substring): + split = suffix_search(substring).start() + suffixes.append(("SUFFIX", substring[split:])) + substring = substring[:split] + # break if pattern matches the empty string + if split == len(substring): + break + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + elif token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + elif list(infix_finditer(substring)): + infixes = infix_finditer(substring) + offset = 0 + for match in infixes: + tokens.append(("TOKEN", substring[offset : match.start()])) + tokens.append(("INFIX", substring[match.start() : match.end()])) + offset = match.end() + if substring[offset:]: + tokens.append(("TOKEN", substring[offset:])) + substring = '' + elif substring: + tokens.append(("TOKEN", substring)) + substring = '' + tokens.extend(reversed(suffixes)) + return tokens + def to_disk(self, path, **kwargs): """Save the current state to a directory. @@ -507,8 +581,7 @@ cdef class Tokenizer: self._reset_specials() self._cache = PreshMap() self._specials = PreshMap() - for string, substrings in data.get("rules", {}).items(): - self.add_special_case(string, substrings) + self._load_special_tokenization(data.get("rules", {})) return self diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index d6ab73f14..7462af739 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes.
| -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A boolean function matching strings to be recognized as tokens. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -128,6 +128,25 @@ and examples. | `string` | unicode | The string to specially tokenize. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | +## Tokenizer.explain {#explain tag="method"} + +Tokenize a string with a slow debugging tokenizer that provides information +about which tokenizer rule or pattern was matched for each token. The tokens +produced are identical to `Tokenizer.__call__` except for whitespace tokens. + +> #### Example +> +> ```python
> tok_exp = nlp.tokenizer.explain("(don't)") +> assert [t[0] for t in tok_exp] == ["PREFIX", "SPECIAL-1", "SPECIAL-2", "SUFFIX"] +> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] +> ``` + +| Name | Type | Description | +| ------------| -------- | --------------------------------------------------- | +| `string` | unicode | The string to tokenize with the debugging tokenizer. | +| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples. | + ## Tokenizer.to_disk {#to_disk tag="method"} Serialize the tokenizer to disk. @@ -198,12 +217,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | -| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| Name | Type | Description | +| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `prefix_search` | - | A function to find segment boundaries from the start of a string.
Returns the length of the segment, or `None`. | +| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | +| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. | +| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index db3aac686..ba85e6154 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -792,6 +792,33 @@ The algorithm can be summarized as follows: tokens on all infixes. 8. Once we can't consume any more of the string, handle it as a single token. +#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} + +A working implementation of the pseudo-code above is available for debugging as +[`nlp.tokenizer.explain(text)`](/api/tokenizer#explain). It returns a list of +tuples showing which tokenizer rule or pattern was matched for each token. The +tokens produced are identical to `nlp.tokenizer()` except for whitespace +tokens: + +```python +### {executable="true"} +from spacy.lang.en import English +nlp = English() +text = '''"Let's go!"''' +doc = nlp(text) +tok_exp = nlp.tokenizer.explain(text) +assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp] +for t in tok_exp: + print(t[1], "\\t", t[0]) + +# " PREFIX +# Let SPECIAL-1 +# 's SPECIAL-2 +# go TOKEN +# ! SUFFIX +# " SUFFIX +``` + ### Customizing spaCy's Tokenizer class {#native-tokenizers} Let's imagine you wanted to create a tokenizer for a new language or specific
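As a companion to the usage example above, `explain()` also helps when building a custom tokenizer while keeping the existing defaults, as the PR description suggests. This is an editorial sketch rather than part of the patch; the `a-b` token-match pattern is the illustrative one from the tests above, and the affix patterns are simply the English defaults recompiled:

```python
import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

nlp = English()

# Recompile the default affix patterns so the custom tokenizer keeps the
# existing English behaviour.
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
infix_re = compile_infix_regex(English.Defaults.infixes)
token_match_re = re.compile("a-b")  # keep the literal "a-b" as one token

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    English.Defaults.tokenizer_exceptions,
    prefix_re.search,
    suffix_re.search,
    infix_re.finditer,
    token_match=token_match_re.match,
)

# explain() shows which rule or pattern produced each token, e.g.
# TOKEN_MATCH for "a-b" and SPECIAL-1/SPECIAL-2 for "isn't".
for pattern, token in nlp.tokenizer.explain("a-b isn't well-known."):
    print(pattern, "\t", token)
```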