Adding like_num test for Czech (#5946)

* Create lex_attrs.py

  Hello, I am missing the Czech language in spaCy, so I would like to help push it forward a little. This file is based on the other lex_attrs.py files, just with a translation to Czech.

* Update __init__.py

  Updated for use with the new Czech lex_attrs file.

* Update stop_words.py

* Create test_text.py

* Add like_num testing for Czech

Co-authored-by: holubvl3 <47881982+holubvl3@users.noreply.github.com>
Co-authored-by: holubvl3 <vilemrousi@gmail.com>
Co-authored-by: Vladimír Holubec <vholubec@arcdata.cz>
2025-12-08 02:34:17 +03:00 · 2020-08-21 17:06:33 +02:00 · 2020-08-21 17:06:33 +02:00 · 56eabcb2f2
commit 56eabcb2f2
parent a341b4ef09
3 changed files with 31 additions and 0 deletions
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -47,6 +47,11 @@ def ca_tokenizer():
     return get_lang_class("ca").Defaults.create_tokenizer()


+@pytest.fixture(scope="session")  # session scope: one tokenizer instance is reused by every test in the run
+def cs_tokenizer():
+    return get_lang_class("cs").Defaults.create_tokenizer()  # tokenizer built from the "cs" language class defaults
+
+
 @pytest.fixture(scope="session")
 def da_tokenizer():
     return get_lang_class("da").Defaults.create_tokenizer()
--- a/spacy/tests/lang/cs/__init__.py
+++ b/spacy/tests/lang/cs/__init__.py
--- a/spacy/tests/lang/cs/test_text.py
+++ b/spacy/tests/lang/cs/test_text.py
@@ -0,0 +1,26 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",  # text: input string; match: expected like_num value of its single token
+    [
+        ("10", True),
+        ("1", True),
+        ("10.000", True),  # digits containing a "." separator
+        ("1000", True),
+        ("999,0", True),  # digits containing a "," separator
+        ("devatenáct", True),  # Czech number word: "nineteen"
+        ("osmdesát", True),  # Czech number word: "eighty"
+        ("kvadrilion", True),  # Czech large-number word
+        ("Pes", False),  # ordinary (non-numeric) word
+        (",", False),  # bare punctuation
+        ("1/2", True),  # fraction written with "/"
+    ],
+)
+def test_lex_attrs_like_number(cs_tokenizer, text, match):
+    tokens = cs_tokenizer(text)
+    assert len(tokens) == 1  # every sample is expected to stay a single token
+    assert tokens[0].like_num == match  # like_num must agree with the expected flag