Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)
Add initial Tagalog (tl) tests (#9582)
* Add tl_tokenizer to test fixtures
* Add Tagalog tests
This commit is contained in:
Parent: 90ec820f05
Commit: f1bc655a38
spacy/tests/conftest.py
@@ -290,6 +290,11 @@ def ti_tokenizer():
     return get_lang_class("ti")().tokenizer


+@pytest.fixture(scope="session")
+def tl_tokenizer():
+    return get_lang_class("tl")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def tr_tokenizer():
     return get_lang_class("tr")().tokenizer
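For context, the new fixture simply instantiates the blank Tagalog language class and returns its tokenizer. A minimal standalone sketch of the same behaviour (assuming a spaCy install that ships the tl language data; this snippet is not part of the commit):

from spacy.util import get_lang_class

# Same object the session-scoped tl_tokenizer fixture provides
tl_tokenizer = get_lang_class("tl")().tokenizer
doc = tl_tokenizer("Sige, punta ka dito")
print([t.text for t in doc])  # expected: ['Sige', ',', 'punta', 'ka', 'dito']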
spacy/tests/lang/tl/__init__.py (new file, 0 lines)
spacy/tests/lang/tl/test_indices.py (new file, 8 lines)
@@ -0,0 +1,8 @@
def test_tl_simple_punct(tl_tokenizer):
    text = "Sige, punta ka dito"
    tokens = tl_tokenizer(text)
    assert tokens[0].idx == 0
    assert tokens[1].idx == 4
    assert tokens[2].idx == 6
    assert tokens[3].idx == 12
    assert tokens[4].idx == 15
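For reference, Token.idx is the token's character offset into the original text: in "Sige, punta ka dito" the expected tokens are "Sige", ",", "punta", "ka" and "dito", starting at offsets 0, 4, 6, 12 and 15, which are exactly the values the assertions above check.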
spacy/tests/lang/tl/test_punct.py (new file, 127 lines)
@@ -0,0 +1,127 @@
import pytest
from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES


PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]


@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_tl_tokenizer_handles_only_punct(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_split_open_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_close_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_open_punct(tl_tokenizer, punct, punct_add, text):
    tokens = tl_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_two_diff_close_punct(tl_tokenizer, punct, punct_add, text):
    tokens = tl_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_open_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_same_close_punct(tl_tokenizer, punct, text):
    tokens = tl_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("text", ["'Ang"])
def test_tl_tokenizer_splits_open_apostrophe(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize("text", ["Mabuhay''"])
def test_tl_tokenizer_splits_double_end_quote(tl_tokenizer, text):
    tokens = tl_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = tl_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_splits_open_close_punct(
    tl_tokenizer, punct_open, punct_close, text
):
    tokens = tl_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize("text", ["Mabuhay"])
def test_tl_tokenizer_two_diff_punct(
    tl_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
    tokens = tl_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize("text,punct", [("(sa'yo", "(")])
def test_tl_tokenizer_splits_pre_punct_regex(text, punct):
    tl_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
    match = tl_search_prefixes(text)
    assert match.group() == punct


def test_tl_tokenizer_splits_bracket_period(tl_tokenizer):
    text = "(Dumating siya kahapon)."
    tokens = tl_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
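The new test files are self-contained, so they can be run in isolation; with a development install of spaCy, an invocation along the lines of "python -m pytest spacy/tests/lang/tl" should select only the Tagalog tests (standard pytest path selection; the exact command depends on the local setup).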
spacy/tests/lang/tl/test_text.py (new file, 73 lines)
@@ -0,0 +1,73 @@
import pytest
from spacy.lang.tl.lex_attrs import like_num

# https://github.com/explosion/spaCy/blob/master/spacy/tests/lang/en/test_text.py


def test_tl_tokenizer_handles_long_text(tl_tokenizer):
    # Excerpt: "Sapagkat ang Pilosopiya ay Ginagawa" by Padre Roque Ferriols
    text = """
Tingin tayo nang tingin. Kailangan lamang nating dumilat at
marami tayong makikita. At ang pagtingin ay isang gawain na ako lamang ang
makagagawa, kung ako nga ang makakita. Kahit na napanood na ng aking
matalik na kaibigan ang isang sine, kailangan ko pa ring panoorin, kung
ako nga ang may gustong makakita. Kahit na gaano kadikit ang aming
pagkabuklod, hindi siya maaaring tumingin sa isang paraan na ako ang
nakakakita. Kung ako ang makakita, ako lamang ang makatitingin.
"""
    tokens = tl_tokenizer(text)
    assert len(tokens) == 97


@pytest.mark.parametrize(
    "text,length",
    [
        ("Huwag mo nang itanong sa akin.", 7),
        ("Nasubukan mo na bang hulihin ang hangin?", 8),
        ("Hindi ba?", 3),
        ("Nagbukas ang DFA ng 1,000 appointment slots para sa pasaporte.", 11),
        ("'Wala raw pasok bukas kasi may bagyo!' sabi ni Micah.", 14),
        ("'Ingat,' aniya. 'Maingay sila pag malayo at tahimik kung malapit.'", 17),
    ],
)
def test_tl_tokenizer_handles_cnts(tl_tokenizer, text, length):
    tokens = tl_tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("isa", True),
        ("dalawa", True),
        ("tatlumpu", True),
        pytest.param(
            "isang daan",
            True,
            marks=pytest.mark.xfail(reason="Not yet implemented (means 100)"),
        ),
        pytest.param(
            "kalahati",
            True,
            marks=pytest.mark.xfail(reason="Not yet implemented (means 1/2)"),
        ),
        pytest.param(
            "isa't kalahati",
            True,
            marks=pytest.mark.xfail(
                reason="Not yet implemented (means one-and-a-half)"
            ),
        ),
    ],
)
def test_lex_attrs_like_number(tl_tokenizer, text, match):
    tokens = tl_tokenizer(text)
    assert all([token.like_num for token in tokens]) == match


@pytest.mark.xfail(reason="Not yet implemented, fails when capitalized.")
@pytest.mark.parametrize("word", ["isa", "dalawa", "tatlo"])
def test_tl_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())
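The xfail cases above (multi-word numbers, fractions, capitalized words) hint at how like_num lexical attributes are commonly written for spaCy languages: a digit check plus a verbatim lookup against a list of number words. The following is a rough illustrative sketch, not the actual spacy.lang.tl.lex_attrs implementation; _num_words here is a made-up subset.

_num_words = ["isa", "dalawa", "tatlo", "apat", "lima", "tatlumpu"]  # illustrative subset

def like_num(text):
    # Plain digits, optionally with thousands separators, count as numbers
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Simple fractions such as "1/2"
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Number words are matched verbatim here, so "Isa"/"ISA" would not match,
    # which is the behaviour the capitalization xfail above points at
    return text in _num_words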