Add initial Tagalog (tl) tests (#9582)

* Add tl_tokenizer to test fixtures

* Add Tagalog tests
Lj Miranda 2021-11-02 15:35:49 +08:00 committed by GitHub
parent 90ec820f05
commit f1bc655a38
5 changed files with 213 additions and 0 deletions


@@ -290,6 +290,11 @@ def ti_tokenizer():
    return get_lang_class("ti")().tokenizer


@pytest.fixture(scope="session")
def tl_tokenizer():
    return get_lang_class("tl")().tokenizer


@pytest.fixture(scope="session")
def tr_tokenizer():
    return get_lang_class("tr")().tokenizer


@@ -0,0 +1,8 @@
def test_tl_simple_punct(tl_tokenizer):
    text = "Sige, punta ka dito"
    tokens = tl_tokenizer(text)
    assert tokens[0].idx == 0
    assert tokens[1].idx == 4
    assert tokens[2].idx == 6
    assert tokens[3].idx == 12
    assert tokens[4].idx == 15
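
The expected idx values above are plain character offsets into the input string, so they can be cross-checked without spaCy at all; a small sketch:

text = "Sige, punta ka dito"
assert text.index("Sige") == 0   # token 0
assert text.index(",") == 4      # token 1
assert text.index("punta") == 6  # token 2
assert text.index("ka") == 12    # token 3
assert text.index("dito") == 15  # token 4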


@@ -0,0 +1,127 @@
import pytest
from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES

PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]


@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_tl_tokenizer_handles_only_punct(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_split_open_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_close_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_open_punct(tl_tokenizer, punct, punct_add, text):
    tokens = tl_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_close_punct(tl_tokenizer, punct, punct_add, text):
    tokens = tl_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_open_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_close_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("text", ["'Ang"])
def test_tl_tokenizer_splits_open_apostrophe(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize("text", ["Mabuhay''"])
def test_tl_tokenizer_splits_double_end_quote(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = tl_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_open_close_punct(
    tl_tokenizer, punct_open, punct_close, text
):
    tokens = tl_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_two_diff_punct(
    tl_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
    tokens = tl_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize("text,punct", [("(sa'yo", "(")])
def test_tl_tokenizer_splits_pre_punct_regex(text, punct):
    tl_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
    match = tl_search_prefixes(text)
    assert match.group() == punct


def test_tl_tokenizer_splits_bracket_period(tl_tokenizer):
    text = "(Dumating siya kahapon)."
    tokens = tl_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
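
The prefix-regex assertion above can also be reproduced interactively with the same spacy.util helper that the tokenizer uses for prefix splitting; a minimal sketch:

from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES

search_prefix = compile_prefix_regex(TOKENIZER_PREFIXES).search
match = search_prefix("(sa'yo")
print(match.group())  # "(" matches as a prefix, so it is split off the front of the token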


@@ -0,0 +1,73 @@
import pytest
from spacy.lang.tl.lex_attrs import like_num

# https://github.com/explosion/spaCy/blob/master/spacy/tests/lang/en/test_text.py


def test_tl_tokenizer_handles_long_text(tl_tokenizer):
    # Excerpt: "Sapagkat ang Pilosopiya ay Ginagawa" by Padre Roque Ferriols
    text = """
    Tingin tayo nang tingin. Kailangan lamang nating dumilat at
    marami tayong makikita. At ang pagtingin ay isang gawain na ako lamang ang
    makagagawa, kung ako nga ang makakita. Kahit na napanood na ng aking
    matalik na kaibigan ang isang sine, kailangan ko pa ring panoorin, kung
    ako nga ang may gustong makakita. Kahit na gaano kadikit ang aming
    pagkabuklod, hindi siya maaaring tumingin sa isang paraan na ako ang
    nakakakita. Kung ako ang makakita, ako lamang ang makatitingin.
    """
    tokens = tl_tokenizer(text)
    assert len(tokens) == 97


@pytest.mark.parametrize(
    "text,length",
    [
        ("Huwag mo nang itanong sa akin.", 7),
        ("Nasubukan mo na bang hulihin ang hangin?", 8),
        ("Hindi ba?", 3),
        ("Nagbukas ang DFA ng 1,000 appointment slots para sa pasaporte.", 11),
        ("'Wala raw pasok bukas kasi may bagyo!' sabi ni Micah.", 14),
        ("'Ingat,' aniya. 'Maingay sila pag malayo at tahimik kung malapit.'", 17),
    ],
)
def test_tl_tokenizer_handles_cnts(tl_tokenizer, text, length):
    tokens = tl_tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("isa", True),
        ("dalawa", True),
        ("tatlumpu", True),
        pytest.param(
            "isang daan",
            True,
            marks=pytest.mark.xfail(reason="Not yet implemented (means 100)"),
        ),
        pytest.param(
            "kalahati",
            True,
            marks=pytest.mark.xfail(reason="Not yet implemented (means 1/2)"),
        ),
        pytest.param(
            "isa't kalahati",
            True,
            marks=pytest.mark.xfail(
                reason="Not yet implemented (means one-and-a-half)"
            ),
        ),
    ],
)
def test_lex_attrs_like_number(tl_tokenizer, text, match):
    tokens = tl_tokenizer(text)
    assert all([token.like_num for token in tokens]) == match


@pytest.mark.xfail(reason="Not yet implemented, fails when capitalized.")
@pytest.mark.parametrize("word", ["isa", "dalawa", "tatlo"])
def test_tl_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())
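
like_num is a lexical-attribute function that takes a plain string, which is why the last test can call it without a tokenizer; a short usage sketch (the True values follow the parametrized cases above, the False case is an assumption for a non-number word):

from spacy.lang.tl.lex_attrs import like_num

print(like_num("10"))      # True: digit string
print(like_num("dalawa"))  # True: listed Tagalog number word ("two")
print(like_num("bahay"))   # presumably False: "bahay" ("house") is not a number word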