Update Tokenizer.explain tests

Ines Montani 2019-11-20 13:14:11 +01:00
parent 2c876eb672
commit 2e7c896fe5


@@ -1,30 +1,63 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import importlib
 import pytest
 
 from spacy.util import get_lang_class
 
-# fmt: off
 # Only include languages with no external dependencies
-# excluded: ja, ru, th, uk, vi, zh
-LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
-             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur"]
-# fmt: on
+# "is" seems to confuse importlib, so we're also excluding it for now
+# excluded: ja, ru, th, uk, vi, zh, is
+LANGUAGES = [
+    "af",
+    "ar",
+    "bg",
+    "bn",
+    pytest.param("ca", marks=pytest.mark.xfail()),
+    "cs",
+    "da",
+    "de",
+    "el",
+    "en",
+    "es",
+    "et",
+    "fa",
+    "fi",
+    pytest.param("fr", marks=pytest.mark.xfail()),
+    "ga",
+    "he",
+    "hi",
+    "hr",
+    pytest.param("hu", marks=pytest.mark.xfail()),
+    "id",
+    "it",
+    "kn",
+    "lt",
+    "lv",
+    "nb",
+    "nl",
+    pytest.param("pl", marks=pytest.mark.xfail()),
+    "pt",
+    "ro",
+    "si",
+    "sk",
+    "sl",
+    "sq",
+    "sr",
+    "sv",
+    "ta",
+    "te",
+    "tl",
+    "tr",
+    "tt",
+    "ur",
+]
 
 
-# @pytest.mark.slow
+@pytest.mark.slow
 @pytest.mark.parametrize("lang", LANGUAGES)
 def test_tokenizer_explain(lang):
     nlp = get_lang_class(lang)()
-    try:
-        examples = importlib.import_module("spacy.lang." + lang + ".examples")
-        for sentence in examples.sentences:
-            tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space]
-            debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)]
-            assert tokens == debug_tokens
-    except:
-        pass
+    examples = pytest.importorskip("spacy.lang.{}.examples".format(lang))
+    for sentence in examples.sentences:
+        tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space]
+        debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)]
+        assert tokens == debug_tokens
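
The point of the change: the old version wrapped the whole test body in a bare try/except, so a missing examples module and a genuine tokenizer mismatch were both silently ignored. pytest.importorskip reports a missing module as a skip, and pytest.param(..., marks=pytest.mark.xfail()) keeps the languages marked xfail here (ca, fr, hu, pl) in the test matrix as expected failures. A minimal sketch of the two idioms, for illustration only (the test names and values below are made up and are not part of this commit):

import pytest


def test_optional_dependency():
    # Missing module -> the test is reported as SKIPPED, not silently
    # passed; "statistics" merely stands in for an optional dependency.
    statistics = pytest.importorskip("statistics")
    assert statistics.mean([1, 2, 3]) == 2


@pytest.mark.parametrize(
    "lang",
    [
        "en",
        "de",
        # A known-broken case stays visible as XFAIL in the report
        # instead of being swallowed by a bare try/except.
        pytest.param("xx", marks=pytest.mark.xfail()),
    ],
)
def test_lang_is_supported(lang):
    assert lang in ("en", "de")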