Add initial Tagalog (tl) tests (#9582)

* Add tl_tokenizer to test fixtures

* Add Tagalog tests
Lj Miranda 2021-11-02 15:35:49 +08:00 committed by GitHub
parent 90ec820f05
commit f1bc655a38
5 changed files with 213 additions and 0 deletions


@@ -290,6 +290,11 @@ def ti_tokenizer():
    return get_lang_class("ti")().tokenizer


@pytest.fixture(scope="session")
def tl_tokenizer():
    return get_lang_class("tl")().tokenizer


@pytest.fixture(scope="session")
def tr_tokenizer():
    return get_lang_class("tr")().tokenizer


@@ -0,0 +1,8 @@
def test_tl_simple_punct(tl_tokenizer):
    text = "Sige, punta ka dito"
    tokens = tl_tokenizer(text)
    assert tokens[0].idx == 0
    assert tokens[1].idx == 4
    assert tokens[2].idx == 6
    assert tokens[3].idx == 12
    assert tokens[4].idx == 15
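
The expected idx values above are plain character offsets into the input string, so they can be cross-checked without spaCy at all; a small sketch:

text = "Sige, punta ka dito"
assert text.index("Sige") == 0   # token 0
assert text.index(",") == 4      # token 1
assert text.index("punta") == 6  # token 2
assert text.index("ka") == 12    # token 3
assert text.index("dito") == 15  # token 4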


@@ -0,0 +1,127 @@
import pytest
from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES

PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]


@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_tl_tokenizer_handles_only_punct(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_split_open_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_close_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_open_punct(tl_tokenizer, punct, punct_add, text):
    tokens = tl_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_close_punct(tl_tokenizer, punct, punct_add, text):
    tokens = tl_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_open_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_close_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("text", ["'Ang"])
def test_tl_tokenizer_splits_open_apostrophe(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize("text", ["Mabuhay''"])
def test_tl_tokenizer_splits_double_end_quote(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = tl_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_open_close_punct(
    tl_tokenizer, punct_open, punct_close, text
):
    tokens = tl_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_two_diff_punct(
    tl_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
    tokens = tl_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize("text,punct", [("(sa'yo", "(")])
def test_tl_tokenizer_splits_pre_punct_regex(text, punct):
    tl_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
    match = tl_search_prefixes(text)
    assert match.group() == punct


def test_tl_tokenizer_splits_bracket_period(tl_tokenizer):
    text = "(Dumating siya kahapon)."
    tokens = tl_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
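
The prefix-regex assertion above can also be reproduced interactively with the same spacy.util helper that the tokenizer uses for prefix splitting; a minimal sketch:

from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES

search_prefix = compile_prefix_regex(TOKENIZER_PREFIXES).search
match = search_prefix("(sa'yo")
print(match.group())  # "(" matches as a prefix, so it is split off the front of the token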


@@ -0,0 +1,73 @@
import pytest
from spacy.lang.tl.lex_attrs import like_num

# https://github.com/explosion/spaCy/blob/master/spacy/tests/lang/en/test_text.py


def test_tl_tokenizer_handles_long_text(tl_tokenizer):
    # Excerpt: "Sapagkat ang Pilosopiya ay Ginagawa" by Padre Roque Ferriols
    text = """
    Tingin tayo nang tingin. Kailangan lamang nating dumilat at
    marami tayong makikita. At ang pagtingin ay isang gawain na ako lamang ang
    makagagawa, kung ako nga ang makakita. Kahit na napanood na ng aking
    matalik na kaibigan ang isang sine, kailangan ko pa ring panoorin, kung
    ako nga ang may gustong makakita. Kahit na gaano kadikit ang aming
    pagkabuklod, hindi siya maaaring tumingin sa isang paraan na ako ang
    nakakakita. Kung ako ang makakita, ako lamang ang makatitingin.
    """
    tokens = tl_tokenizer(text)
    assert len(tokens) == 97


@pytest.mark.parametrize(
    "text,length",
    [
        ("Huwag mo nang itanong sa akin.", 7),
        ("Nasubukan mo na bang hulihin ang hangin?", 8),
        ("Hindi ba?", 3),
        ("Nagbukas ang DFA ng 1,000 appointment slots para sa pasaporte.", 11),
        ("'Wala raw pasok bukas kasi may bagyo!' sabi ni Micah.", 14),
        ("'Ingat,' aniya. 'Maingay sila pag malayo at tahimik kung malapit.'", 17),
    ],
)
def test_tl_tokenizer_handles_cnts(tl_tokenizer, text, length):
    tokens = tl_tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("isa", True),
        ("dalawa", True),
        ("tatlumpu", True),
        pytest.param(
            "isang daan",
            True,
            marks=pytest.mark.xfail(reason="Not yet implemented (means 100)"),
        ),
        pytest.param(
            "kalahati",
            True,
            marks=pytest.mark.xfail(reason="Not yet implemented (means 1/2)"),
        ),
        pytest.param(
            "isa't kalahati",
            True,
            marks=pytest.mark.xfail(
                reason="Not yet implemented (means one-and-a-half)"
            ),
        ),
    ],
)
def test_lex_attrs_like_number(tl_tokenizer, text, match):
    tokens = tl_tokenizer(text)
    assert all([token.like_num for token in tokens]) == match


@pytest.mark.xfail(reason="Not yet implemented, fails when capitalized.")
@pytest.mark.parametrize("word", ["isa", "dalawa", "tatlo"])
def test_tl_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())
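
like_num is a lexical-attribute function that takes a plain string, which is why the last test can call it without a tokenizer; a short usage sketch (the True values follow the parametrized cases above, the False case is an assumption for a non-number word):

from spacy.lang.tl.lex_attrs import like_num

print(like_num("10"))      # True: digit string
print(like_num("dalawa"))  # True: listed Tagalog number word ("two")
print(like_num("bahay"))   # presumably False: "bahay" ("house") is not a number word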