Add tokenizer explain() debugging method (#4596)

* Expose tokenizer rules as a property Expose the tokenizer rules property in the same way as the other core properties. (The cache resetting is overkill, but consistent with `from_bytes` for now.) Add tests and update Tokenizer API docs. * Update Hungarian punctuation to remove empty string Update Hungarian punctuation definitions so that `_units` does not match an empty string. * Use _load_special_tokenization consistently Use `_load_special_tokenization()` and have it to handle `None` checks. * Fix precedence of `token_match` vs. special cases Remove `token_match` check from `_split_affixes()` so that special cases have precedence over `token_match`. `token_match` is checked only before infixes are split. * Add `make_debug_doc()` to the Tokenizer Add `make_debug_doc()` to the Tokenizer as a working implementation of the pseudo-code in the docs. Add a test (marked as slow) that checks that `nlp.tokenizer()` and `nlp.tokenizer.make_debug_doc()` return the same non-whitespace tokens for all languages that have `examples.sentences` that can be imported. * Update tokenization usage docs Update pseudo-code and algorithm description to correspond to `nlp.tokenizer.make_debug_doc()` with example debugging usage. Add more examples for customizing tokenizers while preserving the existing defaults. Minor edits / clarifications. * Revert "Update Hungarian punctuation to remove empty string" This reverts commit f0a577f7a5. * Rework `make_debug_doc()` as `explain()` Rework `make_debug_doc()` as `explain()`, which returns a list of `(pattern_string, token_string)` tuples rather than a non-standard `Doc`. Update docs and tests accordingly, leaving the visualization for future work. * Handle cases with bad tokenizer patterns Detect when tokenizer patterns match empty prefixes and suffixes so that `explain()` does not hang on bad patterns. * Remove unused displacy image * Add tokenizer.explain() to usage docs
2025-09-15 16:42:36 +03:00 · 2019-11-20 13:07:25 +01:00 · 2019-11-20 13:07:25 +01:00 · 2c876eb672
commit 2c876eb672
parent a3c43a1692
5 changed files with 258 additions and 27 deletions
--- a/spacy/tests/lang/en/test_customized_tokenizer.py
+++ b/spacy/tests/lang/en/test_customized_tokenizer.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
    )


@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
        "Megaregion",
        ".",
    ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    # The fixture's `token_match` regex matches "a-b", so that string must
+    # come out as one token, while "10-county" is still split into three
+    # tokens around the hyphen.
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    # With the tokenizer rules passed to the fixture left untouched, the
+    # trailing ":)" is expected to be kept as a single token.
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    # Exercises the `rules` getter and setter: after the ":)" rule is deleted
+    # and the rules are assigned back, ":)" must be split into ":" and ")".
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    rules = custom_en_tokenizer.rules
+    del rules[":)"]
+    custom_en_tokenizer.rules = rules
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":",
+        ")",
+    ]
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import importlib
+import pytest
+from spacy.util import get_lang_class
+
+
+# fmt: off
+# Only include languages with no external dependencies
+# excluded: ja, ru, th, uk, vi, zh
+LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
+             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
+             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
+# fmt: on
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("lang", LANGUAGES)
+def test_tokenizer_explain(lang):
+    """Check that `explain()` yields the same non-whitespace tokens as the
+    regular tokenizer for every example sentence of the language.
+
+    lang (unicode): Language code used to look up the language class and
+        its `examples` module.
+    """
+    nlp = get_lang_class(lang)()
+    # Skip languages that ship no importable `examples` module. Only the
+    # import is guarded: a bare `except: pass` around the whole body would
+    # also swallow the AssertionError and make this test unable to fail.
+    examples = pytest.importorskip("spacy.lang." + lang + ".examples")
+    for sentence in examples.sentences:
+        tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space]
+        debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)]
+        assert tokens == debug_tokens
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -15,6 +15,8 @@ import re
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
 from .compat import unescape_unicode
+from .attrs import intify_attrs
+from .symbols import ORTH

 from .errors import Errors, Warnings, deprecation_warning
 from . import util
@ -57,9 +59,7 @@ cdef class Tokenizer:
        self.infix_finditer = infix_finditer
        self.vocab = vocab
        self._rules = {}
-        if rules is not None:
-            for chunk, substrings in sorted(rules.items()):
-                self.add_special_case(chunk, substrings)
+        self._load_special_tokenization(rules)

    property token_match:
        def __get__(self):
@ -93,6 +93,18 @@ cdef class Tokenizer:
            self._infix_finditer = infix_finditer
            self._flush_cache()

+    property rules:
+        # Special-case tokenization rules, keyed by the string they apply to.
+        def __get__(self):
+            # Hands out the internal dict itself, not a copy.
+            return self._rules
+
+        def __set__(self, rules):
+            # Drop the current rules and reset the cache and specials tables
+            # before loading the replacement rules, so that no entry built
+            # from the old rules is left behind.
+            self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
+            self._cache = PreshMap()
+            self._specials = PreshMap()
+            self._load_special_tokenization(rules)
+
    def __reduce__(self):
        args = (self.vocab,
                self._rules,
@ -227,10 +239,6 @@ cdef class Tokenizer:
        cdef unicode minus_suf
        cdef size_t last_size = 0
        while string and len(string) != last_size:
-            if self.token_match and self.token_match(string) \
-                    and not self.find_prefix(string) \
-                    and not self.find_suffix(string):
-                break
            if self._specials.get(hash_string(string)) != NULL:
                has_special[0] = 1
                break
@ -393,8 +401,9 @@ cdef class Tokenizer:

    def _load_special_tokenization(self, special_cases):
        """Add special-case tokenization rules."""
-        for chunk, substrings in sorted(special_cases.items()):
-            self.add_special_case(chunk, substrings)
+        # `None` means there are no special cases to add; handling it here
+        # lets callers pass their `rules` argument through unchecked.
+        if special_cases is not None:
+            for chunk, substrings in sorted(special_cases.items()):
+                self.add_special_case(chunk, substrings)

    def add_special_case(self, unicode string, substrings):
        """Add a special-case tokenization rule.
@ -423,6 +432,71 @@ cdef class Tokenizer:
            self.mem.free(stale_cached)
        self._rules[string] = substrings

+    def explain(self, text):
+        """A debugging tokenizer that provides information about which
+        tokenizer rule or pattern was matched for each token. The tokens
+        produced are identical to `nlp.tokenizer()` except for whitespace
+        tokens.
+
+        text (unicode): The string to tokenize.
+        RETURNS (list): A list of (pattern_string, token_string) tuples
+
+        DOCS: https://spacy.io/api/tokenizer#explain
+        """
+        prefix_search = self.prefix_search
+        suffix_search = self.suffix_search
+        infix_finditer = self.infix_finditer
+        token_match = self.token_match
+        special_cases = {}
+        for orth, special_tokens in self.rules.items():
+            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
+        tokens = []
+        # Whitespace is only used to split the text; it never becomes a token.
+        for substring in text.split():
+            # Suffixes are stripped from the outside in, so they are collected
+            # separately and emitted in reverse once the chunk is consumed.
+            suffixes = []
+            while substring:
+                while prefix_search(substring) or suffix_search(substring):
+                    if substring in special_cases:
+                        tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                        substring = ''
+                        break
+                    prefix = prefix_search(substring)
+                    if prefix:
+                        split = prefix.end()
+                        # break if pattern matches the empty string; this has
+                        # to happen before anything is recorded, otherwise an
+                        # empty ("PREFIX", "") token would be emitted
+                        if split == 0:
+                            break
+                        tokens.append(("PREFIX", substring[:split]))
+                        substring = substring[split:]
+                        if substring in special_cases:
+                            continue
+                    suffix = suffix_search(substring)
+                    if suffix:
+                        split = suffix.start()
+                        # break if pattern matches the empty string; compare
+                        # against the untruncated substring, since after
+                        # truncation `split == len(substring)` always holds
+                        if split == len(substring):
+                            break
+                        suffixes.append(("SUFFIX", substring[split:]))
+                        substring = substring[:split]
+                if substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
+                # `token_match` is optional and may be None
+                elif token_match is not None and token_match(substring):
+                    tokens.append(("TOKEN_MATCH", substring))
+                    substring = ''
+                else:
+                    # Split on infixes in a single pass. Without any infix
+                    # match the whole remainder is emitted as one TOKEN below.
+                    offset = 0
+                    for match in infix_finditer(substring):
+                        # never emit empty pieces (e.g. an infix match at the
+                        # very start, or a zero-width infix match)
+                        if substring[offset : match.start()]:
+                            tokens.append(("TOKEN", substring[offset : match.start()]))
+                        if substring[match.start() : match.end()]:
+                            tokens.append(("INFIX", substring[match.start() : match.end()]))
+                        offset = match.end()
+                    if substring[offset:]:
+                        tokens.append(("TOKEN", substring[offset:]))
+                    substring = ''
+            tokens.extend(reversed(suffixes))
+        return tokens
+
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

@ -507,8 +581,7 @@ cdef class Tokenizer:
            self._reset_specials()
            self._cache = PreshMap()
            self._specials = PreshMap()
-            for string, substrings in data.get("rules", {}).items():
-                self.add_special_case(string, substrings)
+            self._load_special_tokenization(data.get("rules", {}))

        return self

--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@ -34,15 +34,15 @@ the
 > tokenizer = nlp.Defaults.create_tokenizer(nlp)
 > ```

-| Name             | Type        | Description                                                                         |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------- |
-| `vocab`          | `Vocab`     | A storage container for lexical types.                                              |
-| `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                     |
-| `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes. |
-| `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes. |
-| `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes. |
-| `token_match`    | callable    | A boolean function matching strings to be recognized as tokens.                     |
-| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                       |
+| Name             | Type        | Description                                                                                                                   |
+| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`          | `Vocab`     | A storage container for lexical types.                                                                                        |
+| `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                                                               |
+| `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.                                           |
+| `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.                                           |
+| `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.                                           |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches.                                        |
+| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                                                                 |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}

@ -128,6 +128,25 @@ and examples.
 | `string`      | unicode  | The string to specially tokenize.                                                                                                                                        |
 | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |

+## Tokenizer.explain {#explain tag="method"}
+
+Tokenize a string with a slow debugging tokenizer that provides information
+about which tokenizer rule or pattern was matched for each token. The tokens
+produced are identical to `Tokenizer.__call__` except for whitespace tokens.
+
+> #### Example
+>
+> ```python
+> tok_exp = nlp.tokenizer.explain("(don't)")
+> assert [t[0] for t in tok_exp] == ["PREFIX", "SPECIAL-1", "SPECIAL-2", "SUFFIX"]
+> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
+> ```
+
+| Name        | Type     | Description                                         |
+| ------------| -------- | --------------------------------------------------- |
+| `text`      | unicode  | The string to tokenize with the debugging tokenizer |
+| **RETURNS** | list     | A list of `(pattern_string, token_string)` tuples   |
+
 ## Tokenizer.to_disk {#to_disk tag="method"}

 Serialize the tokenizer to disk.
@ -198,12 +217,14 @@ it.

 ## Attributes {#attributes}

-| Name             | Type    | Description                                                                                                                |
-| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                      |
-| `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.            |
-| `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.              |
-| `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| Name             | Type    | Description                                                                                                                 |
+| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                       |
+| `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.             |
+| `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.               |
+| `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects.  |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules`          | dict        | A dictionary of tokenizer exceptions and special cases.                                                                  |

 ## Serialization fields {#serialization-fields}

--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@ -792,6 +792,33 @@ The algorithm can be summarized as follows:
   tokens on all infixes.
 8. Once we can't consume any more of the string, handle it as a single token.

+#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
+
+A working implementation of the pseudo-code above is available for debugging as
+[`nlp.tokenizer.explain(text)`](/api/tokenizer#explain). It returns a list of
+tuples showing which tokenizer rule or pattern was matched for each token. The
+tokens produced are identical to `nlp.tokenizer()` except for whitespace
+tokens:
+
+```python
+### {executable="true"}
+from spacy.lang.en import English
+nlp = English()
+text = '''"Let's go!"'''
+doc = nlp(text)
+tok_exp = nlp.tokenizer.explain(text)
+assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
+for t in tok_exp:
+    print(t[1], "\\t", t[0])
+
+# " 	 PREFIX
+# Let 	 SPECIAL-1
+# 's 	 SPECIAL-2
+# go 	 TOKEN
+# ! 	 SUFFIX
+# " 	 SUFFIX
+```
+
 ### Customizing spaCy's Tokenizer class {#native-tokenizers}

 Let's imagine you wanted to create a tokenizer for a new language or specific