mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-14 03:26:24 +03:00
Update Tokenizer.explain for special cases with whitespace (#13086)
* Update Tokenizer.explain for special cases with whitespace

  Update `Tokenizer.explain` to skip special case matches if the exact text has not been matched due to intervening whitespace.

  Enable fuzzy `Tokenizer.explain` tests with additional whitespace normalization.

* Add unit test for special cases with whitespace, xfail fuzzy tests again
This commit is contained in:
parent
ff9ddb6a07
commit
0c25725359
|
@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
|
||||||
assert tokens == explain_tokens
|
assert tokens == explain_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
|
||||||
|
rules = {":]": [{"ORTH": ":]"}]}
|
||||||
|
tokenizer = Tokenizer(
|
||||||
|
en_vocab,
|
||||||
|
rules=rules,
|
||||||
|
)
|
||||||
|
text = ": ]"
|
||||||
|
tokens = [t.text for t in tokenizer(text)]
|
||||||
|
explain_tokens = [t[1] for t in tokenizer.explain(text)]
|
||||||
|
assert tokens == explain_tokens
|
||||||
|
|
||||||
|
|
||||||
@hypothesis.strategies.composite
|
@hypothesis.strategies.composite
|
||||||
def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
|
def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
|
||||||
"""
|
"""
|
||||||
|
@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tokenizer: Tokenizer = spacy.blank(lang).tokenizer
|
tokenizer: Tokenizer = spacy.blank(lang).tokenizer
|
||||||
tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
|
# Tokenizer.explain is not intended to handle whitespace or control
|
||||||
|
# characters in the same way as Tokenizer
|
||||||
|
sentence = re.sub(r"\s+", " ", sentence).strip()
|
||||||
|
tokens = [t.text for t in tokenizer(sentence)]
|
||||||
debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
|
debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
|
||||||
assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
|
assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
|
||||||
|
|
|
@ -730,9 +730,16 @@ cdef class Tokenizer:
|
||||||
if i in spans_by_start:
|
if i in spans_by_start:
|
||||||
span = spans_by_start[i]
|
span = spans_by_start[i]
|
||||||
exc = [d[ORTH] for d in special_cases[span.label_]]
|
exc = [d[ORTH] for d in special_cases[span.label_]]
|
||||||
for j, orth in enumerate(exc):
|
# The phrase matcher can overmatch for tokens separated by
|
||||||
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
|
# spaces in the text but not in the underlying rule, so skip
|
||||||
i += len(span)
|
# cases where the texts aren't identical
|
||||||
|
if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
|
||||||
|
final_tokens.append(tokens[i])
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
for j, orth in enumerate(exc):
|
||||||
|
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
|
||||||
|
i += len(span)
|
||||||
else:
|
else:
|
||||||
final_tokens.append(tokens[i])
|
final_tokens.append(tokens[i])
|
||||||
i += 1
|
i += 1
|
||||||
|
|
Loading…
Reference in New Issue
Block a user