Update Tokenizer.explain with special matches (#7749)

* Update Tokenizer.explain with special matches

Update `Tokenizer.explain` and the pseudo-code in the docs to include
the processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

Add test for `Tokenizer.explain` for special cases containing affixes.
Adriane Boyd 2021-04-19 11:08:20 +02:00 committed by GitHub
parent 07b41c38ae
commit 0e7f94b247
3 changed files with 59 additions and 4 deletions
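
For context, a minimal sketch of the behavior this commit addresses (the blank English vocab below is an assumption for illustration, not part of the diff): with a special case `"a."` and a `.` suffix rule, the special case only becomes visible after the affixes have been split off, so `Tokenizer.explain` previously reported the trailing `a.` as two pieces while the tokenizer itself merged them back in its final special-cases pass.

```python
# Minimal sketch mirroring the new test (blank English vocab is an
# assumption, not part of the commit): the special case "a." overlaps
# with the "." suffix rule, so it is only found in the final pass.
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

vocab = English().vocab
tokenizer = Tokenizer(
    vocab,
    rules={"a.": [{"ORTH": "a."}]},
    suffix_search=re.compile(r"[\.]$").search,
    infix_finditer=re.compile(r"[/]").finditer,
)
print([t.text for t in tokenizer("a/a.")])        # ['a', '/', 'a.']
print([t[1] for t in tokenizer.explain("a/a.")])  # now also ['a', '/', 'a.']
```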


@@ -1,5 +1,7 @@
 import pytest
+import re
 from spacy.util import get_lang_class
+from spacy.tokenizer import Tokenizer

 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
     tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens
+
+
+def test_tokenizer_explain_special_matcher(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    infix_re = re.compile(r"[/]")
+    rules = {"a.": [{"ORTH": "a."}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+        suffix_search=suffix_re.search,
+        infix_finditer=infix_re.finditer,
+    )
+    tokens = [t.text for t in tokenizer("a/a.")]
+    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
+    assert tokens == explain_tokens


@@ -20,11 +20,12 @@ from .attrs import intify_attrs
 from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
-from .util import registry
+from .util import registry, get_words_and_spaces
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
 from .training import validate_examples
+from .tokens import Span


 cdef class Tokenizer:
@@ -638,8 +639,14 @@ cdef class Tokenizer:
         DOCS: https://spacy.io/api/tokenizer#explain
         """
         prefix_search = self.prefix_search
+        if prefix_search is None:
+            prefix_search = re.compile("a^").search
         suffix_search = self.suffix_search
+        if suffix_search is None:
+            suffix_search = re.compile("a^").search
         infix_finditer = self.infix_finditer
+        if infix_finditer is None:
+            infix_finditer = re.compile("a^").finditer
         token_match = self.token_match
         if token_match is None:
             token_match = re.compile("a^").match
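
The `"a^"` fallback used for unset optional settings is a never-matching pattern: `^` asserts the start of the string, which cannot occur after a literal `a` has already been consumed. A standalone check (not part of the diff):

```python
import re

# "a^" can never match anything: "^" asserts the start of the string,
# which cannot hold once a literal "a" has been consumed before it.
never = re.compile("a^")
assert never.search("any text at all") is None
assert never.match("anything") is None
assert list(never.finditer("aaa")) == []
```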
@@ -687,7 +694,7 @@ cdef class Tokenizer:
                     tokens.append(("URL_MATCH", substring))
                     substring = ''
                 elif substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                     substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@ cdef class Tokenizer:
                     tokens.append(("TOKEN", substring))
                     substring = ''
             tokens.extend(reversed(suffixes))
-        return tokens
+        # Find matches for special cases handled by special matcher
+        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
+        t_words = []
+        t_spaces = []
+        for word, space in zip(words, spaces):
+            if not word.isspace():
+                t_words.append(word)
+                t_spaces.append(space)
+        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
+        matches = self._special_matcher(doc)
+        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
+        spans = util.filter_spans(spans)
+        # Replace matched tokens with their exceptions
+        i = 0
+        final_tokens = []
+        spans_by_start = {s.start: s for s in spans}
+        while i < len(tokens):
+            if i in spans_by_start:
+                span = spans_by_start[i]
+                exc = [d[ORTH] for d in special_cases[span.label_]]
+                for j, orth in enumerate(exc):
+                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                i += len(span)
+            else:
+                final_tokens.append(tokens[i])
+                i += 1
+        return final_tokens

     def score(self, examples, **kwargs):
         validate_examples(examples, "Tokenizer.score")
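
The new final pass aligns the preliminary debug tokens back to the original text with `get_words_and_spaces`, builds a throwaway `Doc`, runs the tokenizer's special-cases matcher over it, keeps the longest non-overlapping matches via `util.filter_spans`, and then splices the exception tokens in. A standalone sketch of that splice step (hypothetical helper name, plain tuples instead of `Doc`/`Span` objects):

```python
# Standalone sketch (hypothetical helper, not spaCy API) of the splice
# step: walk the preliminary (tag, text) tokens and replace each matched
# span with the tokens of its special case.
def splice_special_cases(tokens, spans_by_start, special_cases):
    final_tokens = []
    i = 0
    while i < len(tokens):
        if i in spans_by_start:
            label, length = spans_by_start[i]  # matched rule text, span length
            for j, exc in enumerate(special_cases[label]):
                final_tokens.append((f"SPECIAL-{j + 1}", exc["ORTH"]))
            i += length  # skip the tokens covered by the match
        else:
            final_tokens.append(tokens[i])
            i += 1
    return final_tokens

# "a/a." after the affix/infix passes, before the final pass:
tokens = [("TOKEN", "a"), ("INFIX", "/"), ("TOKEN", "a"), ("SUFFIX", ".")]
spans_by_start = {2: ("a.", 2)}  # tokens 2-3 match the special case "a."
special_cases = {"a.": [{"ORTH": "a."}]}
print(splice_special_cases(tokens, spans_by_start, special_cases))
# [('TOKEN', 'a'), ('INFIX', '/'), ('SPECIAL-1', 'a.')]
```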


@@ -786,6 +786,7 @@ rather than performance:

 ```python
 def tokenizer_pseudo_code(
     text,
+    special_cases,
     prefix_search,
     suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
             tokens.append(substring)
             substring = ""
         tokens.extend(reversed(suffixes))
+    for match in matcher(special_cases, text):
+        tokens.replace(match, special_cases[match])
     return tokens
 ```

 The algorithm can be summarized as follows:

-1. Iterate over whitespace-separated substrings.
+1. Iterate over space-separated substrings.
 2. Look for a token match. If there is a match, stop processing and keep this
    token.
 3. Check whether we have an explicitly defined special case for this substring.
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
 8. Look for "infixes" (stuff like hyphens etc.) and split the substring into
    tokens on all infixes.
 9. Once we can't consume any more of the string, handle it as a single token.
+10. Make a final pass over the text to check for special cases that include
+    spaces or that were missed due to the incremental processing of affixes.

 </Accordion>
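
In practice, `explain` pairs each substring with the pattern that produced it; a small usage sketch (blank English pipeline assumed; the expected output below relies on the default English tokenizer exceptions):

```python
# Usage sketch (assumed blank English pipeline, not part of the diff):
# explain() pairs each substring with the rule or pattern that matched it.
import spacy

nlp = spacy.blank("en")
for rule, substring in nlp.tokenizer.explain("Let's go!"):
    print(substring, "\t", rule)
# Let      SPECIAL-1
# 's       SPECIAL-2
# go       TOKEN
# !        SUFFIX
```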