New tests for a number of alpha languages (#9703)

* Added Slovak

* Added Slovenian tests

* Added Estonian tests

* Added Croatian tests

* Added Latvian tests

* Added Icelandic tests

* Added Afrikaans tests

* Added language-independent tests

* Added Kannada tests

* Tidied up

* Added Albanian tests

* Formatted with black

* Added failing tests for anomalies

* Update spacy/tests/lang/af/test_text.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Estonian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Croatian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Icelandic tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Latvian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovak tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Added context to failing Slovenian tokenizer test

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Richard Hudson 2021-11-28 21:59:23 +01:00 committed by GitHub
parent 5c44533263
commit 7b134b8fbd
28 changed files with 543 additions and 0 deletions
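The new suites can be run selectively with pytest. A minimal sketch, as an illustration only and not part of the commit: it assumes a spaCy source checkout with pytest installed, and only the Afrikaans path below is named explicitly in the commit message; the other languages follow the same layout.

# Hypothetical selective run of the new Afrikaans text tests from the repo root
import pytest

pytest.main(["spacy/tests/lang/af/test_text.py", "-v"])  # -v lists each case, including the xfail anomalies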

View File

@@ -49,6 +49,11 @@ def tokenizer():
    return get_lang_class("xx")().tokenizer


@pytest.fixture(scope="session")
def af_tokenizer():
    return get_lang_class("af")().tokenizer


@pytest.fixture(scope="session")
def am_tokenizer():
    return get_lang_class("am")().tokenizer

@@ -125,6 +130,11 @@ def es_vocab():
    return get_lang_class("es")().vocab


@pytest.fixture(scope="session")
def et_tokenizer():
    return get_lang_class("et")().tokenizer


@pytest.fixture(scope="session")
def eu_tokenizer():
    return get_lang_class("eu")().tokenizer

@@ -185,6 +195,11 @@ def id_tokenizer():
    return get_lang_class("id")().tokenizer


@pytest.fixture(scope="session")
def is_tokenizer():
    return get_lang_class("is")().tokenizer


@pytest.fixture(scope="session")
def it_tokenizer():
    return get_lang_class("it")().tokenizer

@@ -212,6 +227,11 @@ def lt_tokenizer():
    return get_lang_class("lt")().tokenizer


@pytest.fixture(scope="session")
def lv_tokenizer():
    return get_lang_class("lv")().tokenizer


@pytest.fixture(scope="session")
def mk_tokenizer():
    return get_lang_class("mk")().tokenizer

@@ -279,11 +299,26 @@ def sa_tokenizer():
    return get_lang_class("sa")().tokenizer


@pytest.fixture(scope="session")
def sk_tokenizer():
    return get_lang_class("sk")().tokenizer


@pytest.fixture(scope="session")
def sl_tokenizer():
    return get_lang_class("sl")().tokenizer


@pytest.fixture(scope="session")
def sr_tokenizer():
    return get_lang_class("sr")().tokenizer


@pytest.fixture(scope="session")
def sq_tokenizer():
    return get_lang_class("sq")().tokenizer


@pytest.fixture(scope="session")
def sv_tokenizer():
    return get_lang_class("sv")().tokenizer

@@ -344,6 +379,11 @@ def vi_tokenizer():
    return get_lang_class("vi")().tokenizer


@pytest.fixture(scope="session")
def xx_tokenizer():
    return get_lang_class("xx")().tokenizer


@pytest.fixture(scope="session")
def yo_tokenizer():
    return get_lang_class("yo")().tokenizer

View File

View File

@@ -0,0 +1,22 @@
import pytest


def test_long_text(af_tokenizer):
    # Excerpt: Universal Declaration of Human Rights; “'n” changed to “die” in first sentence
    text = """
Hierdie Universele Verklaring van Menseregte as die algemene standaard vir die verwesenliking deur alle mense en nasies,
om te verseker dat elke individu en elke deel van die gemeenskap hierdie Verklaring in ag sal neem en deur opvoeding,
respek vir hierdie regte en vryhede te bevorder, op nasionale en internasionale vlak, daarna sal strewe om die universele
en effektiewe erkenning en agting van hierdie regte te verseker, nie net vir die mense van die Lidstate nie, maar ook vir
die mense in die gebiede onder hul jurisdiksie.
"""
    tokens = af_tokenizer(text)
    assert len(tokens) == 100


@pytest.mark.xfail
def test_indefinite_article(af_tokenizer):
    text = "as 'n algemene standaard"
    tokens = af_tokenizer(text)
    assert len(tokens) == 4

View File

@@ -0,0 +1,29 @@
import pytest

AF_BASIC_TOKENIZATION_TESTS = [
    (
        "Elkeen het die reg tot lewe, vryheid en sekuriteit van persoon.",
        [
            "Elkeen",
            "het",
            "die",
            "reg",
            "tot",
            "lewe",
            ",",
            "vryheid",
            "en",
            "sekuriteit",
            "van",
            "persoon",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", AF_BASIC_TOKENIZATION_TESTS)
def test_af_tokenizer_basic(af_tokenizer, text, expected_tokens):
    tokens = af_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,26 @@
import pytest


def test_long_text(et_tokenizer):
    # Excerpt: European Convention on Human Rights
    text = """
arvestades, et nimetatud deklaratsiooni eesmärk on tagada selles
kuulutatud õiguste üldine ja tõhus tunnustamine ning järgimine;
arvestades, et Euroopa Nõukogu eesmärk on saavutada tema
liikmete suurem ühtsus ning et üheks selle eesmärgi saavutamise
vahendiks on inimõiguste ja põhivabaduste järgimine ning
elluviimine;
taaskinnitades oma sügavat usku neisse põhivabadustesse, mis
on õigluse ja rahu aluseks maailmas ning mida kõige paremini
tagab ühelt poolt tõhus poliitiline demokraatia ning teiselt poolt
inimõiguste, millest nad sõltuvad, üldine mõistmine ja järgimine;
"""
    tokens = et_tokenizer(text)
    assert len(tokens) == 94


@pytest.mark.xfail
def test_ordinal_number(et_tokenizer):
    text = "10. detsembril 1948"
    tokens = et_tokenizer(text)
    assert len(tokens) == 3

View File

@@ -0,0 +1,29 @@
import pytest

ET_BASIC_TOKENIZATION_TESTS = [
    (
        "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda "
        "ega karistada.",
        [
            "Kedagi",
            "ei",
            "või",
            "piinata",
            "ega",
            "ebainimlikult",
            "või",
            "alandavalt",
            "kohelda",
            "ega",
            "karistada",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", ET_BASIC_TOKENIZATION_TESTS)
def test_et_tokenizer_basic(et_tokenizer, text, expected_tokens):
    tokens = et_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,26 @@
import pytest


def test_long_text(hr_tokenizer):
    # Excerpt: European Convention on Human Rights
    text = """
uzimajući u obzir da ta deklaracija nastoji osigurati opće i djelotvorno
priznanje i poštovanje u njoj proglašenih prava;
uzimajući u obzir da je cilj Vijeća Europe postizanje većeg jedinstva
njegovih članica, i da je jedan od načina postizanja toga cilja
očuvanje i daljnje ostvarivanje ljudskih prava i temeljnih sloboda;
potvrđujući svoju duboku privrženost tim temeljnim slobodama
koje su osnova pravde i mira u svijetu i koje su najbolje zaštićene
istinskom političkom demokracijom s jedne strane te zajedničkim
razumijevanjem i poštovanjem ljudskih prava o kojima te slobode
ovise s druge strane;
"""
    tokens = hr_tokenizer(text)
    assert len(tokens) == 105


@pytest.mark.xfail
def test_ordinal_number(hr_tokenizer):
    text = "10. prosinca 1948"
    tokens = hr_tokenizer(text)
    assert len(tokens) == 3

View File

@@ -0,0 +1,31 @@
import pytest

HR_BASIC_TOKENIZATION_TESTS = [
    (
        "Nitko se ne smije podvrgnuti mučenju ni nečovječnom ili "
        "ponižavajućem postupanju ili kazni.",
        [
            "Nitko",
            "se",
            "ne",
            "smije",
            "podvrgnuti",
            "mučenju",
            "ni",
            "nečovječnom",
            "ili",
            "ponižavajućem",
            "postupanju",
            "ili",
            "kazni",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", HR_BASIC_TOKENIZATION_TESTS)
def test_hr_tokenizer_basic(hr_tokenizer, text, expected_tokens):
    tokens = hr_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,26 @@
import pytest


def test_long_text(is_tokenizer):
    # Excerpt: European Convention on Human Rights
    text = """
hafa í huga, yfirlýsing þessi hefur það markmið tryggja
almenna og raunhæfa viðurkenningu og vernd þeirra réttinda,
sem þar er lýst;
hafa í huga, markmið Evrópuráðs er koma á nánari einingu
aðildarríkjanna og ein af leiðunum því marki er ,
mannréttindi og mannfrelsi séu í heiðri höfð og efld;
lýsa á eindreginni trú sinni á það mannfrelsi, sem er undirstaða
réttlætis og friðar í heiminum og best er tryggt, annars vegar með
virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
"""
    tokens = is_tokenizer(text)
    assert len(tokens) == 120


@pytest.mark.xfail
def test_ordinal_number(is_tokenizer):
    text = "10. desember 1948"
    tokens = is_tokenizer(text)
    assert len(tokens) == 3

View File

@@ -0,0 +1,30 @@
import pytest

IS_BASIC_TOKENIZATION_TESTS = [
    (
        "Enginn maður skal sæta pyndingum eða ómannlegri eða "
        "vanvirðandi meðferð eða refsingu. ",
        [
            "Enginn",
            "maður",
            "skal",
            "sæta",
            "pyndingum",
            "eða",
            "ómannlegri",
            "eða",
            "vanvirðandi",
            "meðferð",
            "eða",
            "refsingu",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
    tokens = is_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,27 @@
import pytest


def test_long_text(lv_tokenizer):
    # Excerpt: European Convention on Human Rights
    text = """
Ievērodamas, ka šī deklarācija paredz nodrošināt vispārēju un
efektīvu tajā pasludināto tiesību atzīšanu un ievērošanu;
Ievērodamas, ka Eiropas Padomes mērķis ir panākt lielāku vienotību
tās dalībvalstu starpā un ka viens no līdzekļiem, šo mērķi
sasniegt, ir cilvēka tiesību un pamatbrīvību ievērošana un turpmāka
īstenošana;
No jauna apliecinādamas patiesu pārliecību, ka šīs pamatbrīvības
ir taisnīguma un miera pamats visā pasaulē un ka tās vislabāk var
nodrošināt patiess demokrātisks politisks režīms no vienas puses un
vispārējo cilvēktiesību, uz kurām tās pamatojas, kopīga izpratne un
ievērošana no otras puses;
"""
    tokens = lv_tokenizer(text)
    assert len(tokens) == 109


@pytest.mark.xfail
def test_ordinal_number(lv_tokenizer):
    text = "10. decembrī"
    tokens = lv_tokenizer(text)
    assert len(tokens) == 2

View File

@@ -0,0 +1,30 @@
import pytest

LV_BASIC_TOKENIZATION_TESTS = [
    (
        "Nevienu nedrīkst spīdzināt vai cietsirdīgi vai pazemojoši ar viņu "
        "apieties vai sodīt.",
        [
            "Nevienu",
            "nedrīkst",
            "spīdzināt",
            "vai",
            "cietsirdīgi",
            "vai",
            "pazemojoši",
            "ar",
            "viņu",
            "apieties",
            "vai",
            "sodīt",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", LV_BASIC_TOKENIZATION_TESTS)
def test_lv_tokenizer_basic(lv_tokenizer, text, expected_tokens):
    tokens = lv_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,48 @@
import pytest


def test_long_text(sk_tokenizer):
    # Excerpt: European Convention on Human Rights
    text = """
majúc na zreteli, že cieľom tejto deklarácie je zabezpečiť všeobecné
a účinné uznávanie a dodržiavanie práv v nej vyhlásených;
majúc na zreteli, že cieľom Rady Európy je dosiahnutie väčšej
jednoty medzi jej členmi, a že jedným zo spôsobov, ktorým sa
tento cieľ napĺňať, je ochrana a ďalší rozvoj ľudských práv
a základných slobôd;
znovu potvrdzujúc svoju hlbokú vieru v tie základné slobody, ktoré
základom spravodlivosti a mieru vo svete, a ktoré najlepšie
zachovávané na jednej strane účinnou politickou demokraciou
a na strane druhej spoločným poňatím a dodržiavaním ľudských
práv, od ktorých závisia;
"""
    tokens = sk_tokenizer(text)
    assert len(tokens) == 118


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("štyri", True),
        ("devätnásť", True),
        ("milión", True),
        ("pes", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(sk_tokenizer, text, match):
    tokens = sk_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match


@pytest.mark.xfail
def test_ordinal_number(sk_tokenizer):
    text = "10. decembra 1948"
    tokens = sk_tokenizer(text)
    assert len(tokens) == 3
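The like_num cases above exercise the Slovak lexical attributes, as the test name suggests, rather than tokenization itself. A minimal sketch of one expectation from the table, checked directly and assuming the same blank-pipeline setup as the sk_tokenizer fixture:

# One like_num expectation from the table, reproduced outside pytest
from spacy.util import get_lang_class

sk_tokenizer = get_lang_class("sk")().tokenizer
doc = sk_tokenizer("devätnásť")  # a Slovak number word from the parametrized cases
assert len(doc) == 1 and doc[0].like_num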

View File

@@ -0,0 +1,15 @@
import pytest

SK_BASIC_TOKENIZATION_TESTS = [
    (
        "Kedy sa narodil Andrej Kiska?",
        ["Kedy", "sa", "narodil", "Andrej", "Kiska", "?"],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", SK_BASIC_TOKENIZATION_TESTS)
def test_sk_tokenizer_basic(sk_tokenizer, text, expected_tokens):
    tokens = sk_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,27 @@
import pytest


def test_long_text(sl_tokenizer):
    # Excerpt: European Convention on Human Rights
    text = """
upoštevajoč, da si ta deklaracija prizadeva zagotoviti splošno in
učinkovito priznavanje in spoštovanje v njej razglašenih pravic,
upoštevajoč, da je cilj Sveta Evrope doseči večjo enotnost med
njegovimi članicami, in da je eden izmed načinov za zagotavljanje
tega cilja varstvo in nadaljnji razvoj človekovih pravic in temeljnih
svoboščin,
ponovno potrjujoč svojo globoko vero v temeljne svoboščine, na
katerih temeljita pravičnost in mir v svetu, in ki jih je mogoče najbolje
zavarovati na eni strani z dejansko politično demokracijo in na drugi
strani s skupnim razumevanjem in spoštovanjem človekovih pravic,
od katerih so te svoboščine odvisne,
"""
    tokens = sl_tokenizer(text)
    assert len(tokens) == 116


@pytest.mark.xfail
def test_ordinal_number(sl_tokenizer):
    text = "10. decembra 1948"
    tokens = sl_tokenizer(text)
    assert len(tokens) == 3

View File

@@ -0,0 +1,32 @@
import pytest

SL_BASIC_TOKENIZATION_TESTS = [
    (
        "Vsakdo ima pravico do spoštovanja njegovega zasebnega in "
        "družinskega življenja, doma in dopisovanja.",
        [
            "Vsakdo",
            "ima",
            "pravico",
            "do",
            "spoštovanja",
            "njegovega",
            "zasebnega",
            "in",
            "družinskega",
            "življenja",
            ",",
            "doma",
            "in",
            "dopisovanja",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", SL_BASIC_TOKENIZATION_TESTS)
def test_sl_tokenizer_basic(sl_tokenizer, text, expected_tokens):
    tokens = sl_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,25 @@
import pytest


def test_long_text(sq_tokenizer):
    # Excerpt: European Convention on Human Rights
    text = """
Qeveritë nënshkruese, anëtare Këshillit Evropës,
Duke pasur parasysh Deklaratën Universale Drejtave
Njeriut, shpallur nga Asambleja e Përgjithshme e Kombeve
Bashkuara 10 dhjetor 1948;
Duke pasur parasysh, se kjo Deklaratë ka për qëllim sigurojë
njohjen dhe zbatimin universal dhe efektiv drejtave
shpallura ;
Duke pasur parasysh se qëllimi i Këshillit Evropës është
realizojë një bashkim ngushtë midis anëtarëve tij dhe
se një nga mjetet për arritur këtë qëllim është mbrojtja dhe
zhvillimi i drejtave njeriut dhe i lirive themelore;
Duke ripohuar besimin e tyre thellë këto liri themelore
përbëjnë themelet e drejtësisë dhe paqes botë, ruajtja e
cilave mbështetet kryesisht mbi një regjim politik demokratik nga
njëra anë, dhe nga ana tjetër mbi një kuptim dhe respektim
përbashkët drejtave njeriut nga cilat varen;
"""
    tokens = sq_tokenizer(text)
    assert len(tokens) == 182

View File

@@ -0,0 +1,31 @@
import pytest

SQ_BASIC_TOKENIZATION_TESTS = [
    (
        "Askush nuk mund ti nënshtrohet torturës ose dënimeve ose "
        "trajtimeve çnjerëzore ose poshtëruese.",
        [
            "Askush",
            "nuk",
            "mund",
            "ti",
            "nënshtrohet",
            "torturës",
            "ose",
            "dënimeve",
            "ose",
            "trajtimeve",
            "çnjerëzore",
            "ose",
            "poshtëruese",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", SQ_BASIC_TOKENIZATION_TESTS)
def test_sq_tokenizer_basic(sq_tokenizer, text, expected_tokens):
    tokens = sq_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list

View File

View File

@@ -0,0 +1,24 @@
import pytest


def test_long_text(xx_tokenizer):
    # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
    text = """
Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.
Alggmeer kriteeʹr vuâđđâʹvve meeraikõskksaž tuâjjorganisaatio, ILO, suåppmõʹšše nââmar 169.
Suåppmõõžž mieʹldd jiõččvälddsaž jânnmin jälsteei meeraid ââʹnet alggmeeran,
ko sij puõlvvâʹvve naroodâst, kååʹtt jânnam välddmõõžž leʹbe aazztummuž leʹbe ânnʼjõž riikkraaʹji šõddâm ääiʹj jälste
jânnmest leʹbe tõn mäddtiõđlaž vuuʹdest, koozz jânnam kooll. Alggmeer ij leäkku mieʹrreei sââʹjest jiiʹjjes jälstemvuuʹdest.
Alggmeer âlgg jiõčč ââʹnned jiiʹjjes alggmeeran leʹbe leeʹd tõn miõlâst, što sij lie alggmeer.
Alggmeer lij õlggâm seeilted vuõiggâdvuõđlaž sââʹjest huõlǩâni obbnes leʹbe vueʹzzi jiiʹjjes sosiaalʼlaž, täälʼlaž,
kulttuurlaž da poliittlaž instituutioid.
Säʹmmlai statuuzz ǩeeʹrjteš Lääʹddjânnam vuâđđläkka eeʹjj 1995. Säʹmmlain alggmeeran lij vuõiggâdvuõtt tuõʹllʼjed da
ooudâsviikkâd ǩiõlâz da kulttuurâz di tõõzz kuulli ääʹrbvuâlaž jieʹllemvueʹjjeez. Sääʹmǩiõl ââʹnnmest veʹrǧǧniiʹǩǩi
åʹrnn lij šiõttuum jiiʹjjes lääʹǩǩ. Säʹmmlain lij leämmaž eeʹjjest 1996 vueʹljeeʹl dommvuuʹdsteez ǩiõlâz da kulttuurâz kuõskki
vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuulli tuâjaid håidd säʹmmlai vaalin vaʹlljääm parlameʹntt,
Sääʹmteʹǧǧ.
"""
    tokens = xx_tokenizer(text)
    assert len(tokens) == 179

View File

@@ -0,0 +1,25 @@
import pytest

XX_BASIC_TOKENIZATION_TESTS = [
    (
        "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
        [
            "Lääʹddjânnmest",
            "lie",
            "nuʹtt",
            "10",
            "000",
            "säʹmmliʹžžed",
            ".",
            "Seeʹst",
            "pâʹjjel",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
    tokens = xx_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list