Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 09:44:36 +03:00)
Fuzz tokenizer.explain: draft for fuzzy tests. (#10771)
* Fuzz tokenizer.explain: draft for fuzzy tests.
* Fuzz tokenizer.explain: xfailing tokenizer.explain() tests. Removed deadline modification. Removed LANGUAGES_WITHOUT_TOKENIZERS.
* Fuzz tokenizer.explain: changed tokenizer initialization to avoid failures in Azure runs.
* Fuzz tokenizer.explain: type hint for tokenizer in test.

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
99aeaf9bd3
commit
357be2614e
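
For context, a minimal standalone sketch (not part of the commit; the example text is illustrative) of the invariant the new fuzz test asserts: the token texts produced by the tokenizer should match the token texts reported by tokenizer.explain().

    import spacy

    nlp = spacy.blank("en")  # blank pipeline; only the tokenizer is exercised
    tokenizer = nlp.tokenizer

    text = "Let's go to N.Y.!"
    tokens = [t.text for t in tokenizer(text) if not t.is_space]
    # tokenizer.explain() returns (pattern name, token text) pairs
    explained = [t[1] for t in tokenizer.explain(text)]
    assert tokens == explained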
@@ -1,7 +1,13 @@
-import pytest
 import re
-from spacy.util import get_lang_class
+import string
+
+import hypothesis
+import hypothesis.strategies
+import pytest
+
+import spacy
 from spacy.tokenizer import Tokenizer
+from spacy.util import get_lang_class
 
 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
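
(Aside, not from the diff: a brief sketch of how hypothesis' @composite decorator works, since the new test relies on it. `draw` pulls a concrete value from any strategy, and the decorated function itself becomes a new strategy. The `int_pair` name is hypothetical.)

    import hypothesis.strategies as st

    @st.composite
    def int_pair(draw: st.DrawFn) -> tuple:
        a = draw(st.integers(min_value=0, max_value=9))
        b = draw(st.integers(min_value=a, max_value=9))  # b depends on a
        return (a, b)  # calling int_pair() yields a strategy over such pairs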
@@ -77,3 +83,46 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     tokens = [t.text for t in tokenizer("a/a.")]
     explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
     assert tokens == explain_tokens
+
+
+@hypothesis.strategies.composite
+def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
+    """
+    Composite strategy for fuzzily generating sentences with varying punctuation.
+
+    draw (hypothesis.strategies.DrawFn): Protocol for the drawing function that fuzzily picks from hypothesis'
+        strategies.
+    max_n_words (int): Max. number of words in the generated sentence.
+    RETURNS (str): Fuzzily generated sentence.
+    """
+
+    punctuation_and_space_regex = "|".join(
+        [*[re.escape(p) for p in string.punctuation], r"\s"]
+    )
+    sentence = [
+        [
+            draw(hypothesis.strategies.text(min_size=1)),
+            draw(hypothesis.strategies.from_regex(punctuation_and_space_regex)),
+        ]
+        for _ in range(
+            draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words))
+        )
+    ]
+
+    return " ".join([token for token_pair in sentence for token in token_pair])
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("lang", LANGUAGES)
+@hypothesis.given(sentence=sentence_strategy())
+def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
+    """
+    Tests whether the output of tokenizer.explain() matches the tokenizer output. Input is generated by hypothesis.
+    lang (str): Language to test.
+    sentence (str): Fuzzily generated sentence to tokenize.
+    """
+
+    tokenizer: Tokenizer = spacy.blank(lang).tokenizer
+    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
+    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
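
(Hypothetical usage note, not part of the commit: to inspect what the strategy generates, hypothesis strategies expose .example(), which is intended for interactive exploration only, never inside tests.)

    for _ in range(3):
        print(sentence_strategy().example())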