Add tokenizer explain() debugging method (#4596)

* Expose tokenizer rules as a property

Expose the tokenizer rules property in the same way as the other core
properties. (The cache resetting is overkill, but consistent with
`from_bytes` for now.)

Add tests and update Tokenizer API docs.

* Update Hungarian punctuation to remove empty string

Update Hungarian punctuation definitions so that `_units` does not match
an empty string.

* Use _load_special_tokenization consistently

Use `_load_special_tokenization()` and have it handle `None` checks.

* Fix precedence of `token_match` vs. special cases

Remove `token_match` check from `_split_affixes()` so that special cases
have precedence over `token_match`. `token_match` is checked only before
infixes are split.

* Add `make_debug_doc()` to the Tokenizer

Add `make_debug_doc()` to the Tokenizer as a working implementation of
the pseudo-code in the docs.

Add a test (marked as slow) that checks that `nlp.tokenizer()` and
`nlp.tokenizer.make_debug_doc()` return the same non-whitespace tokens
for all languages that have `examples.sentences` that can be imported.

* Update tokenization usage docs

Update pseudo-code and algorithm description to correspond to
`nlp.tokenizer.make_debug_doc()` with example debugging usage.

Add more examples for customizing tokenizers while preserving the
existing defaults.

Minor edits / clarifications.

* Revert "Update Hungarian punctuation to remove empty string"

This reverts commit f0a577f7a5.

* Rework `make_debug_doc()` as `explain()`

Rework `make_debug_doc()` as `explain()`, which returns a list of
`(pattern_string, token_string)` tuples rather than a non-standard
`Doc`. Update docs and tests accordingly, leaving the visualization for
future work.

* Handle cases with bad tokenizer patterns

Detect when tokenizer patterns match empty prefixes and suffixes so that
`explain()` does not hang on bad patterns.

* Remove unused displacy image

* Add tokenizer.explain() to usage docs
This commit is contained in:
adrianeboyd 2019-11-20 13:07:25 +01:00 committed by Ines Montani
parent a3c43a1692
commit 2c876eb672
5 changed files with 258 additions and 27 deletions

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex
@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
r"[\[\]!&:,()\*—–\/-]",
]
infix_re = compile_infix_regex(custom_infixes)
token_match_re = re.compile("a-b")
return Tokenizer(
en_vocab,
English.Defaults.tokenizer_exceptions,
prefix_re.search,
suffix_re.search,
infix_re.finditer,
token_match=None,
token_match=token_match_re.match,
)
@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
"Megaregion",
".",
]
def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
    """`a-b` must come out as a single token while `10-county` is still split."""
    text = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
    # Expected tokens, written whitespace-separated for readability.
    expected = (
        "The 8 and 10 - county definitions a-b not used "
        "for the greater Southern California Megaregion ."
    ).split()
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == expected
def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
    """The trailing `:)` must be kept together as one token."""
    text = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
    # Expected tokens, written whitespace-separated for readability.
    expected = (
        "The 8 and 10 - county definitions are not used "
        "for the greater Southern California Megaregion . :)"
    ).split()
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == expected
def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
    """After removing the `:)` rule via the `rules` property, `:)` is split in two."""
    text = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
    # Read the rules through the property, drop one entry, and assign the
    # result back through the property setter.
    updated_rules = custom_en_tokenizer.rules
    del updated_rules[":)"]
    custom_en_tokenizer.rules = updated_rules
    # Expected tokens, written whitespace-separated for readability.
    expected = (
        "The 8 and 10 - county definitions are not used "
        "for the greater Southern California Megaregion . : )"
    ).split()
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == expected

View File

@ -0,0 +1,30 @@
# coding: utf-8
from __future__ import unicode_literals
import importlib
import pytest
from spacy.util import get_lang_class
# fmt: off
# Only include languages with no external dependencies
# excluded: ja, ru, th, uk, vi, zh
LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
"et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
"it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
"sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
# fmt: on
@pytest.mark.slow
@pytest.mark.parametrize("lang", LANGUAGES)
def test_tokenizer_explain(lang):
    """Check that `tokenizer.explain()` agrees with the regular tokenizer.

    For every example sentence of the language, the non-whitespace tokens
    produced by `nlp.tokenizer()` must equal the token strings returned by
    `nlp.tokenizer.explain()`. Languages whose `examples` module cannot be
    imported are reported as skipped.
    """
    nlp = get_lang_class(lang)()
    try:
        examples = importlib.import_module("spacy.lang." + lang + ".examples")
    except ImportError:
        # Only a missing examples module is tolerated. The handler must not
        # wrap the assertions below: a bare `except` around them would also
        # swallow AssertionError and the test could never fail.
        pytest.skip("no importable examples for language: " + lang)
    for sentence in examples.sentences:
        tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space]
        debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)]
        assert tokens == debug_tokens

View File

@ -15,6 +15,8 @@ import re
from .tokens.doc cimport Doc
from .strings cimport hash_string
from .compat import unescape_unicode
from .attrs import intify_attrs
from .symbols import ORTH
from .errors import Errors, Warnings, deprecation_warning
from . import util
@ -57,9 +59,7 @@ cdef class Tokenizer:
self.infix_finditer = infix_finditer
self.vocab = vocab
self._rules = {}
if rules is not None:
for chunk, substrings in sorted(rules.items()):
self.add_special_case(chunk, substrings)
self._load_special_tokenization(rules)
property token_match:
def __get__(self):
@ -93,6 +93,18 @@ cdef class Tokenizer:
self._infix_finditer = infix_finditer
self._flush_cache()
# Special-case tokenization rules, exposed as a read/write property.
property rules:
# Getter: returns the dict stored in `self._rules` itself (not a copy).
def __get__(self):
return self._rules
# Setter: discards the stored rules, resets the cache and the specials
# table, then re-adds every entry of `rules` through
# `_load_special_tokenization()`.
# NOTE(review): `_reset_cache()` and `_reset_specials()` are defined outside
# this view; they are called before the maps are replaced, in the same order
# as the reset sequence visible in the deserialization code further down.
def __set__(self, rules):
self._rules = {}
self._reset_cache([key for key in self._cache])
self._reset_specials()
self._cache = PreshMap()
self._specials = PreshMap()
self._load_special_tokenization(rules)
def __reduce__(self):
args = (self.vocab,
self._rules,
@ -227,10 +239,6 @@ cdef class Tokenizer:
cdef unicode minus_suf
cdef size_t last_size = 0
while string and len(string) != last_size:
if self.token_match and self.token_match(string) \
and not self.find_prefix(string) \
and not self.find_suffix(string):
break
if self._specials.get(hash_string(string)) != NULL:
has_special[0] = 1
break
@ -393,8 +401,9 @@ cdef class Tokenizer:
def _load_special_tokenization(self, special_cases):
"""Add special-case tokenization rules."""
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
if special_cases is not None:
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
def add_special_case(self, unicode string, substrings):
"""Add a special-case tokenization rule.
@ -423,6 +432,71 @@ cdef class Tokenizer:
self.mem.free(stale_cached)
self._rules[string] = substrings
def explain(self, text):
    """A debugging tokenizer that provides information about which
    tokenizer rule or pattern was matched for each token. The tokens
    produced are identical to `nlp.tokenizer()` except for whitespace
    tokens.

    text (unicode): The string to tokenize.
    RETURNS (list): A list of (pattern_string, token_string) tuples

    DOCS: https://spacy.io/api/tokenizer#explain
    """
    prefix_search = self.prefix_search
    suffix_search = self.suffix_search
    infix_finditer = self.infix_finditer
    # May be None when the tokenizer was created without a token_match.
    token_match = self.token_match
    special_cases = {}
    for orth, special_tokens in self.rules.items():
        special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
    tokens = []
    for substring in text.split():
        # Suffixes are found outermost-first, so they are collected here and
        # appended in reverse once the rest of the substring is handled.
        suffixes = []
        while substring:
            while prefix_search(substring) or suffix_search(substring):
                if substring in special_cases:
                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                    substring = ''
                    break
                prefix_match = prefix_search(substring)
                if prefix_match:
                    split = prefix_match.end()
                    # break if pattern matches the empty string; checked
                    # before recording anything so that no empty PREFIX
                    # entry is emitted
                    if split == 0:
                        break
                    tokens.append(("PREFIX", substring[:split]))
                    substring = substring[split:]
                    if substring in special_cases:
                        continue
                suffix_match = suffix_search(substring)
                if suffix_match:
                    split = suffix_match.start()
                    # break if pattern matches the empty string; this has to
                    # be compared against the length *before* the suffix is
                    # cut off, otherwise the condition is always true
                    if split == len(substring):
                        break
                    suffixes.append(("SUFFIX", substring[split:]))
                    substring = substring[:split]
            if substring in special_cases:
                tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                substring = ''
            elif token_match is not None and token_match(substring):
                tokens.append(("TOKEN_MATCH", substring))
                substring = ''
            elif list(infix_finditer(substring)):
                infixes = infix_finditer(substring)
                offset = 0
                for match in infixes:
                    # Never report empty strings as tokens or infixes.
                    if substring[offset : match.start()]:
                        tokens.append(("TOKEN", substring[offset : match.start()]))
                    if substring[match.start() : match.end()]:
                        tokens.append(("INFIX", substring[match.start() : match.end()]))
                    offset = match.end()
                if substring[offset:]:
                    tokens.append(("TOKEN", substring[offset:]))
                substring = ''
            elif substring:
                tokens.append(("TOKEN", substring))
                substring = ''
        tokens.extend(reversed(suffixes))
    return tokens
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
@ -507,8 +581,7 @@ cdef class Tokenizer:
self._reset_specials()
self._cache = PreshMap()
self._specials = PreshMap()
for string, substrings in data.get("rules", {}).items():
self.add_special_case(string, substrings)
self._load_special_tokenization(data.get("rules", {}))
return self

View File

@ -34,15 +34,15 @@ the
> tokenizer = nlp.Defaults.create_tokenizer(nlp)
> ```
| Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A boolean function matching strings to be recognized as tokens. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
| Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
## Tokenizer.\_\_call\_\_ {#call tag="method"}
@ -128,6 +128,25 @@ and examples.
| `string` | unicode | The string to specially tokenize. |
| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |
## Tokenizer.explain {#explain tag="method"}
Tokenize a string with a slow debugging tokenizer that provides information
about which tokenizer rule or pattern was matched for each token. The tokens
produced are identical to `Tokenizer.__call__` except for whitespace tokens.
> #### Example
>
> ```python
> tok_exp = nlp.tokenizer.explain("(don't)")
> assert [t[0] for t in tok_exp] == ["PREFIX", "SPECIAL-1", "SPECIAL-2", "SUFFIX"]
> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
> ```
| Name | Type | Description |
| ------------| -------- | --------------------------------------------------- |
| `text` | unicode | The string to tokenize with the debugging tokenizer |
| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
## Tokenizer.to_disk {#to_disk tag="method"}
Serialize the tokenizer to disk.
@ -198,12 +217,14 @@ it.
## Attributes {#attributes}
| Name | Type | Description |
| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
| Name | Type | Description |
| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
## Serialization fields {#serialization-fields}

View File

@ -792,6 +792,33 @@ The algorithm can be summarized as follows:
tokens on all infixes.
8. Once we can't consume any more of the string, handle it as a single token.
#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
A working implementation of the pseudo-code above is available for debugging as
[`nlp.tokenizer.explain(text)`](/api/tokenizer#explain). It returns a list of
tuples showing which tokenizer rule or pattern was matched for each token. The
tokens produced are identical to `nlp.tokenizer()` except for whitespace
tokens:
```python
### {executable="true"}
from spacy.lang.en import English
nlp = English()
text = '''"Let's go!"'''
doc = nlp(text)
tok_exp = nlp.tokenizer.explain(text)
assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
for t in tok_exp:
print(t[1], "\\t", t[0])
# " PREFIX
# Let SPECIAL-1
# 's SPECIAL-2
# go TOKEN
# ! SUFFIX
# " SUFFIX
```
### Customizing spaCy's Tokenizer class {#native-tokenizers}
Let's imagine you wanted to create a tokenizer for a new language or specific