Add tokenizer explain() debugging method (#4596)
* Expose tokenizer rules as a property
Expose the tokenizer rules property in the same way as the other core
properties. (The cache resetting is overkill, but consistent with
`from_bytes` for now.)
Add tests and update Tokenizer API docs.
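For reference, a minimal sketch of how the exposed property behaves (not part of the diff; it assumes the default English tokenizer exceptions include the `:)` emoticon special case, mirroring the `rules` property test added below):

```python
# Minimal sketch of the new Tokenizer.rules property; assumes the default
# English tokenizer exceptions contain the ":)" emoticon special case.
from spacy.lang.en import English

nlp = English()
rules = nlp.tokenizer.rules            # dict of special cases keyed by their string
del rules[":)"]                        # drop one special case
nlp.tokenizer.rules = rules            # setter flushes the cache and reloads the rules
print([t.text for t in nlp.tokenizer("Bye :)")])  # ['Bye', ':', ')'] rather than a single ':)' token
```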
* Update Hungarian punctuation to remove empty string
Update Hungarian punctuation definitions so that `_units` does not match
an empty string.
* Use _load_special_tokenization consistently
Use `_load_special_tokenization()` and have it handle `None` checks.
* Fix precedence of `token_match` vs. special cases
Remove `token_match` check from `_split_affixes()` so that special cases
have precedence over `token_match`. `token_match` is checked only before
infixes are split.
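An illustrative sketch of the resulting behaviour (not taken from the diff; it uses the same `a-b` pattern as the updated test fixture and assumes the `token_match` property setter):

```python
# Illustrative sketch: with this change, a special case wins over token_match.
import re
from spacy.lang.en import English
from spacy.symbols import ORTH

nlp = English()
nlp.tokenizer.token_match = re.compile("a-b").match
assert [t.text for t in nlp.tokenizer("a-b")] == ["a-b"]          # token_match keeps it whole

nlp.tokenizer.add_special_case("a-b", [{ORTH: "a"}, {ORTH: "-"}, {ORTH: "b"}])
assert [t.text for t in nlp.tokenizer("a-b")] == ["a", "-", "b"]  # special case takes precedence
```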
* Add `make_debug_doc()` to the Tokenizer
Add `make_debug_doc()` to the Tokenizer as a working implementation of
the pseudo-code in the docs.
Add a test (marked as slow) that checks that `nlp.tokenizer()` and
`nlp.tokenizer.make_debug_doc()` return the same non-whitespace tokens
for all languages that have `examples.sentences` that can be imported.
* Update tokenization usage docs
Update pseudo-code and algorithm description to correspond to
`nlp.tokenizer.make_debug_doc()` with example debugging usage.
Add more examples for customizing tokenizers while preserving the
existing defaults.
Minor edits / clarifications.
* Revert "Update Hungarian punctuation to remove empty string"
This reverts commit f0a577f7a5.
* Rework `make_debug_doc()` as `explain()`
Rework `make_debug_doc()` as `explain()`, which returns a list of
`(pattern_string, token_string)` tuples rather than a non-standard
`Doc`. Update docs and tests accordingly, leaving the visualization for
future work.
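The example added to the API docs (see the diff below) shows the shape of the return value:

```python
# Each token is paired with the rule or pattern that produced it.
from spacy.lang.en import English

nlp = English()
tok_exp = nlp.tokenizer.explain("(don't)")
assert [t[0] for t in tok_exp] == ["PREFIX", "SPECIAL-1", "SPECIAL-2", "SUFFIX"]
assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
```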
* Handle cases with bad tokenizer patterns
Detect when tokenizer patterns match empty prefixes and suffixes so that
`explain()` does not hang on bad patterns.
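For illustration only (a hypothetical bad pattern, not from the diff): a suffix regex that can match the empty string would otherwise keep the affix loop from ever shrinking the string; `explain()` now detects the zero-width match and returns:

```python
# Hypothetical bad pattern: r"\d*$" can match an empty suffix on any token,
# so naive affix stripping would never make progress.
import re
from spacy.lang.en import English

nlp = English()
nlp.tokenizer.suffix_search = re.compile(r"\d*$").search
print(nlp.tokenizer.explain("hello"))  # returns (with an empty SUFFIX entry) instead of hanging
```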
* Remove unused displacy image
* Add tokenizer.explain() to usage docs
This commit is contained in:
parent a3c43a1692
commit 2c876eb672
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import pytest
+import re
 from spacy.lang.en import English
 from spacy.tokenizer import Tokenizer
 from spacy.util import compile_prefix_regex, compile_suffix_regex
@@ -19,13 +20,14 @@ def custom_en_tokenizer(en_vocab):
         r"[\[\]!&:,()\*—–\/-]",
     ]
     infix_re = compile_infix_regex(custom_infixes)
+    token_match_re = re.compile("a-b")
     return Tokenizer(
         en_vocab,
         English.Defaults.tokenizer_exceptions,
         prefix_re.search,
         suffix_re.search,
         infix_re.finditer,
-        token_match=None,
+        token_match=token_match_re.match,
     )


@@ -74,3 +76,81 @@ def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
         "Megaregion",
         ".",
     ]
+
+
+def test_en_customized_tokenizer_handles_token_match(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion."
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "a-b",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":)",
+    ]
+
+
+def test_en_customized_tokenizer_handles_rules_property(custom_en_tokenizer):
+    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)"
+    rules = custom_en_tokenizer.rules
+    del rules[":)"]
+    custom_en_tokenizer.rules = rules
+    context = [word.text for word in custom_en_tokenizer(sentence)]
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+        ":",
+        ")",
+    ]
spacy/tests/tokenizer/test_explain.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import importlib
+import pytest
+from spacy.util import get_lang_class
+
+
+# fmt: off
+# Only include languages with no external dependencies
+# excluded: ja, ru, th, uk, vi, zh
+LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
+             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
+             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
+# fmt: on
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("lang", LANGUAGES)
+def test_tokenizer_explain(lang):
+    nlp = get_lang_class(lang)()
+    try:
+        examples = importlib.import_module("spacy.lang." + lang + ".examples")
+        for sentence in examples.sentences:
+            tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space]
+            debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)]
+            assert tokens == debug_tokens
+    except:
+        pass
@@ -15,6 +15,8 @@ import re
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
 from .compat import unescape_unicode
+from .attrs import intify_attrs
+from .symbols import ORTH

 from .errors import Errors, Warnings, deprecation_warning
 from . import util
@@ -57,9 +59,7 @@ cdef class Tokenizer:
         self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
-        if rules is not None:
-            for chunk, substrings in sorted(rules.items()):
-                self.add_special_case(chunk, substrings)
+        self._load_special_tokenization(rules)

     property token_match:
         def __get__(self):
@@ -93,6 +93,18 @@ cdef class Tokenizer:
             self._infix_finditer = infix_finditer
             self._flush_cache()

+    property rules:
+        def __get__(self):
+            return self._rules
+
+        def __set__(self, rules):
+            self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
+            self._cache = PreshMap()
+            self._specials = PreshMap()
+            self._load_special_tokenization(rules)
+
     def __reduce__(self):
         args = (self.vocab,
                 self._rules,
@@ -227,10 +239,6 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
-            if self.token_match and self.token_match(string) \
-                    and not self.find_prefix(string) \
-                    and not self.find_suffix(string):
-                break
            if self._specials.get(hash_string(string)) != NULL:
                has_special[0] = 1
                break
@@ -393,8 +401,9 @@ cdef class Tokenizer:

     def _load_special_tokenization(self, special_cases):
         """Add special-case tokenization rules."""
-        for chunk, substrings in sorted(special_cases.items()):
-            self.add_special_case(chunk, substrings)
+        if special_cases is not None:
+            for chunk, substrings in sorted(special_cases.items()):
+                self.add_special_case(chunk, substrings)

     def add_special_case(self, unicode string, substrings):
         """Add a special-case tokenization rule.
@@ -423,6 +432,71 @@ cdef class Tokenizer:
             self.mem.free(stale_cached)
         self._rules[string] = substrings

+    def explain(self, text):
+        """A debugging tokenizer that provides information about which
+        tokenizer rule or pattern was matched for each token. The tokens
+        produced are identical to `nlp.tokenizer()` except for whitespace
+        tokens.
+
+        string (unicode): The string to tokenize.
+        RETURNS (list): A list of (pattern_string, token_string) tuples
+
+        DOCS: https://spacy.io/api/tokenizer#explain
+        """
+        prefix_search = self.prefix_search
+        suffix_search = self.suffix_search
+        infix_finditer = self.infix_finditer
+        token_match = self.token_match
+        special_cases = {}
+        for orth, special_tokens in self.rules.items():
+            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
+        tokens = []
+        for substring in text.split():
+            suffixes = []
+            while substring:
+                while prefix_search(substring) or suffix_search(substring):
+                    if substring in special_cases:
+                        tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                        substring = ''
+                        break
+                    if prefix_search(substring):
+                        split = prefix_search(substring).end()
+                        tokens.append(("PREFIX", substring[:split]))
+                        substring = substring[split:]
+                        if substring in special_cases:
+                            continue
+                        # break if pattern matches the empty string
+                        if split == 0:
+                            break
+                    if suffix_search(substring):
+                        split = suffix_search(substring).start()
+                        suffixes.append(("SUFFIX", substring[split:]))
+                        substring = substring[:split]
+                        # break if pattern matches the empty string
+                        if split == len(substring):
+                            break
+                if substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
+                elif token_match(substring):
+                    tokens.append(("TOKEN_MATCH", substring))
+                    substring = ''
+                elif list(infix_finditer(substring)):
+                    infixes = infix_finditer(substring)
+                    offset = 0
+                    for match in infixes:
+                        tokens.append(("TOKEN", substring[offset : match.start()]))
+                        tokens.append(("INFIX", substring[match.start() : match.end()]))
+                        offset = match.end()
+                    if substring[offset:]:
+                        tokens.append(("TOKEN", substring[offset:]))
+                    substring = ''
+                elif substring:
+                    tokens.append(("TOKEN", substring))
+                    substring = ''
+            tokens.extend(reversed(suffixes))
+        return tokens
+
     def to_disk(self, path, **kwargs):
         """Save the current state to a directory.

@@ -507,8 +581,7 @@ cdef class Tokenizer:
         self._reset_specials()
         self._cache = PreshMap()
         self._specials = PreshMap()
-        for string, substrings in data.get("rules", {}).items():
-            self.add_special_case(string, substrings)
+        self._load_special_tokenization(data.get("rules", {}))

         return self

@@ -34,15 +34,15 @@ the
 > tokenizer = nlp.Defaults.create_tokenizer(nlp)
 > ```

-| Name             | Type        | Description                                                                          |
-| ---------------- | ----------- | ------------------------------------------------------------------------------------ |
-| `vocab`          | `Vocab`     | A storage container for lexical types.                                               |
-| `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                      |
-| `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.  |
-| `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.  |
-| `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.  |
-| `token_match`    | callable    | A boolean function matching strings to be recognized as tokens.                      |
-| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                        |
+| Name             | Type        | Description                                                                             |
+| ---------------- | ----------- | ----------------------------------------------------------------------------------------- |
+| `vocab`          | `Vocab`     | A storage container for lexical types.                                                  |
+| `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                         |
+| `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.     |
+| `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.     |
+| `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.     |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches.  |
+| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                            |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}

@@ -128,6 +128,25 @@ and examples.
 | `string`      | unicode  | The string to specially tokenize.                                                                                                                                         |
 | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |

+## Tokenizer.explain {#explain tag="method"}
+
+Tokenize a string with a slow debugging tokenizer that provides information
+about which tokenizer rule or pattern was matched for each token. The tokens
+produced are identical to `Tokenizer.__call__` except for whitespace tokens.
+
+> #### Example
+>
+> ```python
+> tok_exp = nlp.tokenizer.explain("(don't)")
+> assert [t[0] for t in tok_exp] == ["PREFIX", "SPECIAL-1", "SPECIAL-2", "SUFFIX"]
+> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
+> ```
+
+| Name        | Type    | Description                                          |
+| ----------- | ------- | ---------------------------------------------------- |
+| `string`    | unicode | The string to tokenize with the debugging tokenizer  |
+| **RETURNS** | list    | A list of `(pattern_string, token_string)` tuples    |
+
 ## Tokenizer.to_disk {#to_disk tag="method"}

 Serialize the tokenizer to disk.
@@ -198,12 +217,14 @@ it.

 ## Attributes {#attributes}

-| Name             | Type    | Description                                                                                                                 |
-| ---------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                       |
-| `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.            |
-| `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.              |
-| `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| Name             | Type    | Description                                                                                                                    |
+| ---------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                          |
+| `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.               |
+| `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.                 |
+| `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects.    |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules`          | dict    | A dictionary of tokenizer exceptions and special cases.                                                                        |

 ## Serialization fields {#serialization-fields}

@@ -792,6 +792,33 @@ The algorithm can be summarized as follows:
    tokens on all infixes.
 8. Once we can't consume any more of the string, handle it as a single token.

+#### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
+
+A working implementation of the pseudo-code above is available for debugging as
+[`nlp.tokenizer.explain(text)`](/api/tokenizer#explain). It returns a list of
+tuples showing which tokenizer rule or pattern was matched for each token. The
+tokens produced are identical to `nlp.tokenizer()` except for whitespace
+tokens:
+
+```python
+### {executable="true"}
+from spacy.lang.en import English
+
+nlp = English()
+text = '''"Let's go!"'''
+doc = nlp(text)
+tok_exp = nlp.tokenizer.explain(text)
+assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
+for t in tok_exp:
+    print(t[1], "\\t", t[0])
+
+# "    PREFIX
+# Let  SPECIAL-1
+# 's   SPECIAL-2
+# go   TOKEN
+# !    SUFFIX
+# "    SUFFIX
+```
+
 ### Customizing spaCy's Tokenizer class {#native-tokenizers}

 Let's imagine you wanted to create a tokenizer for a new language or specific