Update Tokenizer.explain for special cases with whitespace (#13086)

* Update Tokenizer.explain for special cases with whitespace

Update `Tokenizer.explain` to skip special case matches if the exact
text has not been matched due to intervening whitespace.

Enable fuzzy `Tokenizer.explain` tests with additional whitespace
normalization.

* Add unit test for special cases with whitespace, xfail fuzzy tests again
This commit is contained in:
Adriane Boyd 2023-11-06 17:29:59 +01:00 committed by GitHub
parent ff9ddb6a07
commit 0c25725359
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 4 deletions

View File

@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
assert tokens == explain_tokens
def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
    """Check that `Tokenizer.explain` yields the same token texts as the
    tokenizer itself when a special case's characters are split by a space.

    The only rule registered is the special case ":]", while the input text
    is ": ]" (the same characters with a space in between).
    """
    special_cases = {":]": [{"ORTH": ":]"}]}
    tokenizer = Tokenizer(en_vocab, rules=special_cases)
    text = ": ]"
    # Texts produced by the real tokenizer.
    expected = [token.text for token in tokenizer(text)]
    # Each explain() entry is indexed at position 1 to get the token text.
    explained = [entry[1] for entry in tokenizer.explain(text)]
    assert expected == explained
@hypothesis.strategies.composite
def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
"""
@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
"""
tokenizer: Tokenizer = spacy.blank(lang).tokenizer
tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
# Tokenizer.explain is not intended to handle whitespace or control
# characters in the same way as Tokenizer
sentence = re.sub(r"\s+", " ", sentence).strip()
tokens = [t.text for t in tokenizer(sentence)]
debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"

View File

@ -730,9 +730,16 @@ cdef class Tokenizer:
if i in spans_by_start:
span = spans_by_start[i]
exc = [d[ORTH] for d in special_cases[span.label_]]
for j, orth in enumerate(exc):
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
i += len(span)
# The phrase matcher can overmatch for tokens separated by
# spaces in the text but not in the underlying rule, so skip
# cases where the texts aren't identical
if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
final_tokens.append(tokens[i])
i += 1
else:
for j, orth in enumerate(exc):
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
i += len(span)
else:
final_tokens.append(tokens[i])
i += 1