From 2c876eb672f8ea03b7e639d4c75491a92327bd06 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 20 Nov 2019 13:07:25 +0100 Subject: [PATCH] Add tokenizer explain() debugging method (#4596) * Expose tokenizer rules as a property Expose the tokenizer rules property in the same way as the other core properties. (The cache resetting is overkill, but consistent with `from_bytes` for now.) Add tests and update Tokenizer API docs. * Update Hungarian punctuation to remove empty string Update Hungarian punctuation definitions so that `_units` does not match an empty string. * Use _load_special_tokenization consistently Use `_load_special_tokenization()` and have it handle `None` checks. * Fix precedence of `token_match` vs. special cases Remove `token_match` check from `_split_affixes()` so that special cases have precedence over `token_match`. `token_match` is checked only before infixes are split. * Add `make_debug_doc()` to the Tokenizer Add `make_debug_doc()` to the Tokenizer as a working implementation of the pseudo-code in the docs. Add a test (marked as slow) that checks that `nlp.tokenizer()` and `nlp.tokenizer.make_debug_doc()` return the same non-whitespace tokens for all languages that have `examples.sentences` that can be imported. * Update tokenization usage docs Update pseudo-code and algorithm description to correspond to `nlp.tokenizer.make_debug_doc()` with example debugging usage. Add more examples for customizing tokenizers while preserving the existing defaults. Minor edits / clarifications. * Revert "Update Hungarian punctuation to remove empty string" This reverts commit f0a577f7a5c67f55807fdbda9e9a936464723931. * Rework `make_debug_doc()` as `explain()` Rework `make_debug_doc()` as `explain()`, which returns a list of `(pattern_string, token_string)` tuples rather than a non-standard `Doc`. Update docs and tests accordingly, leaving the visualization for future work. * Handle cases with bad tokenizer patterns Detect when tokenizer patterns match empty prefixes and suffixes so that `explain()` does not hang on bad patterns.
* Remove unused displacy image * Add tokenizer.explain() to usage docs --- .../lang/en/test_customized_tokenizer.py | 82 +++++++++++++++- spacy/tests/tokenizer/test_explain.py | 30 ++++++ spacy/tokenizer.pyx | 95 ++++++++++++++++--- website/docs/api/tokenizer.md | 51 +++++++--- website/docs/usage/linguistic-features.md | 27 ++++++ 5 files changed, 258 insertions(+), 27 deletions(-) create mode 100644 spacy/tests/tokenizer/test_explain.py diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py index fdac32a90..7f939011f 100644 --- a/spacy/tests/lang/en/test_customized_tokenizer.py +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import re from spacy.lang.en import English from spacy.tokenizer import Tokenizer from spacy.util import compile_prefix_regex, compile_suffix_regex @@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab): r"[\[\]!&:,()\*—–\/-]", ] infix_re = compile_infix_regex(custom_infixes) + token_match_re = re.compile("a-b") return Tokenizer( en_vocab, English.Defaults.tokenizer_exceptions, prefix_re.search, suffix_re.search, infix_re.finditer, - token_match=None, + token_match=token_match_re.match, ) @@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer): "Megaregion", ".", ] + + +def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion." + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "a-b", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ] + + +def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)" + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":)", + ] + + +def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer): + sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. 
:)" + rules = custom_en_tokenizer.rules + del rules[":)"] + custom_en_tokenizer.rules = rules + context = [word.text for word in custom_en_tokenizer(sentence)] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ":", + ")", + ] diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py new file mode 100644 index 000000000..dcd4268ab --- /dev/null +++ b/spacy/tests/tokenizer/test_explain.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import importlib +import pytest +from spacy.util import get_lang_class + + +# fmt: off +# Only include languages with no external dependencies +# excluded: ja, ru, th, uk, vi, zh +LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", + "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", + "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"] +# fmt: on + + +@pytest.mark.slow +@pytest.mark.parametrize("lang", LANGUAGES) +def test_tokenizer_explain(lang): + nlp = get_lang_class(lang)() + try: + examples = importlib.import_module("spacy.lang." + lang + ".examples") + for sentence in examples.sentences: + tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space] + debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)] + assert tokens == debug_tokens + except: + pass diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b39bb1ecb..f95b44283 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -15,6 +15,8 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string from .compat import unescape_unicode +from .attrs import intify_attrs +from .symbols import ORTH from .errors import Errors, Warnings, deprecation_warning from . import util @@ -57,9 +59,7 @@ cdef class Tokenizer: self.infix_finditer = infix_finditer self.vocab = vocab self._rules = {} - if rules is not None: - for chunk, substrings in sorted(rules.items()): - self.add_special_case(chunk, substrings) + self._load_special_tokenization(rules) property token_match: def __get__(self): @@ -93,6 +93,18 @@ cdef class Tokenizer: self._infix_finditer = infix_finditer self._flush_cache() + property rules: + def __get__(self): + return self._rules + + def __set__(self, rules): + self._rules = {} + self._reset_cache([key for key in self._cache]) + self._reset_specials() + self._cache = PreshMap() + self._specials = PreshMap() + self._load_special_tokenization(rules) + def __reduce__(self): args = (self.vocab, self._rules, @@ -227,10 +239,6 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: - if self.token_match and self.token_match(string) \ - and not self.find_prefix(string) \ - and not self.find_suffix(string): - break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -393,8 +401,9 @@ cdef class Tokenizer: def _load_special_tokenization(self, special_cases): """Add special-case tokenization rules.""" - for chunk, substrings in sorted(special_cases.items()): - self.add_special_case(chunk, substrings) + if special_cases is not None: + for chunk, substrings in sorted(special_cases.items()): + self.add_special_case(chunk, substrings) def add_special_case(self, unicode string, substrings): """Add a special-case tokenization rule. 
@@ -423,6 +432,71 @@ cdef class Tokenizer: self.mem.free(stale_cached) self._rules[string] = substrings + def explain(self, text): + """A debugging tokenizer that provides information about which + tokenizer rule or pattern was matched for each token. The tokens + produced are identical to `nlp.tokenizer()` except for whitespace + tokens. + + text (unicode): The string to tokenize. + RETURNS (list): A list of (pattern_string, token_string) tuples. + + DOCS: https://spacy.io/api/tokenizer#explain + """ + prefix_search = self.prefix_search + suffix_search = self.suffix_search + infix_finditer = self.infix_finditer + token_match = self.token_match + special_cases = {} + for orth, special_tokens in self.rules.items(): + special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] + tokens = [] + for substring in text.split(): + suffixes = [] + while substring: + while prefix_search(substring) or suffix_search(substring): + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + break + if prefix_search(substring): + split = prefix_search(substring).end() + tokens.append(("PREFIX", substring[:split])) + substring = substring[split:] + if substring in special_cases: + continue + # break if pattern matches the empty string + if split == 0: + break + if suffix_search(substring): + split = suffix_search(substring).start() + suffixes.append(("SUFFIX", substring[split:])) + substring = substring[:split] + # break if pattern matches the empty string + if split == len(substring): + break + if substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' + elif token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + elif list(infix_finditer(substring)): + infixes = infix_finditer(substring) + offset = 0 + for match in infixes: + tokens.append(("TOKEN", substring[offset : match.start()])) + tokens.append(("INFIX", substring[match.start() : match.end()])) + offset = match.end() + if substring[offset:]: + tokens.append(("TOKEN", substring[offset:])) + substring = '' + elif substring: + tokens.append(("TOKEN", substring)) + substring = '' + tokens.extend(reversed(suffixes)) + return tokens + def to_disk(self, path, **kwargs): """Save the current state to a directory. @@ -507,8 +581,7 @@ cdef class Tokenizer: self._reset_specials() self._cache = PreshMap() self._specials = PreshMap() - for string, substrings in data.get("rules", {}).items(): - self.add_special_case(string, substrings) + self._load_special_tokenization(data.get("rules", {})) return self diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index d6ab73f14..7462af739 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,15 +34,15 @@ the > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` -| Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes.
| -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A boolean function matching strings to be recognized as tokens. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -128,6 +128,25 @@ and examples. | `string` | unicode | The string to specially tokenize. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | +## Tokenizer.explain {#explain tag="method"} + +Tokenize a string with a slow debugging tokenizer that provides information +about which tokenizer rule or pattern was matched for each token. The tokens +produced are identical to `Tokenizer.__call__` except for whitespace tokens. + +> #### Example +> +> ```python
> tok_exp = nlp.tokenizer.explain("(don't)") +> assert [t[0] for t in tok_exp] == ["PREFIX", "SPECIAL-1", "SPECIAL-2", "SUFFIX"] +> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] +> ``` + +| Name | Type | Description | +| ------------| -------- | --------------------------------------------------- | +| `string` | unicode | The string to tokenize with the debugging tokenizer. | +| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples. | + ## Tokenizer.to_disk {#to_disk tag="method"} Serialize the tokenizer to disk. @@ -198,12 +217,14 @@ it. ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | -| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| Name | Type | Description | +| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `prefix_search` | - | A function to find segment boundaries from the start of a string.
Returns the length of the segment, or `None`. | +| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | +| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | +| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. | +| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index db3aac686..ba85e6154 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -792,6 +792,33 @@ The algorithm can be summarized as follows: tokens on all infixes. 8. Once we can't consume any more of the string, handle it as a single token. +#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} + +A working implementation of the pseudo-code above is available for debugging as +[`nlp.tokenizer.explain(text)`](/api/tokenizer#explain). It returns a list of +tuples showing which tokenizer rule or pattern was matched for each token. The +tokens produced are identical to `nlp.tokenizer()` except for whitespace +tokens: + +```python +### {executable="true"} +from spacy.lang.en import English +nlp = English() +text = '''"Let's go!"''' +doc = nlp(text) +tok_exp = nlp.tokenizer.explain(text) +assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp] +for t in tok_exp: + print(t[1], "\\t", t[0]) + +# " PREFIX +# Let SPECIAL-1 +# 's SPECIAL-2 +# go TOKEN +# ! SUFFIX +# " SUFFIX +``` + ### Customizing spaCy's Tokenizer class {#native-tokenizers} Let's imagine you wanted to create a tokenizer for a new language or specific
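As a companion to the usage example above, `explain()` also helps when building a custom tokenizer while keeping the existing defaults, as the PR description suggests. This is an editorial sketch rather than part of the patch; the `a-b` token-match pattern is the illustrative one from the tests above, and the affix patterns are simply the English defaults recompiled:

```python
import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

nlp = English()

# Recompile the default affix patterns so the custom tokenizer keeps the
# existing English behaviour.
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
infix_re = compile_infix_regex(English.Defaults.infixes)
token_match_re = re.compile("a-b")  # keep the literal "a-b" as one token

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    English.Defaults.tokenizer_exceptions,
    prefix_re.search,
    suffix_re.search,
    infix_re.finditer,
    token_match=token_match_re.match,
)

# explain() shows which rule or pattern produced each token, e.g.
# TOKEN_MATCH for "a-b" and SPECIAL-1/SPECIAL-2 for "isn't".
for pattern, token in nlp.tokenizer.explain("a-b isn't well-known."):
    print(pattern, "\t", token)
```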