This commit is contained in:
Matthew Honnibal 2024-09-09 11:22:52 +02:00
parent b65491b641
commit 59ac7e6bdb
7 changed files with 68 additions and 18 deletions

View File

@ -4,10 +4,43 @@ from ...attrs import LIKE_NUM
_num_words = [
"ཀླད་ཀོར་",
"གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་",
"བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
"སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་",
"སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
"གཅིག་",
"གཉིས་",
"གསུམ་",
"བཞི་",
"ལྔ་",
"དྲུག་",
"བདུན་",
"བརྒྱད་",
"དགུ་",
"བཅུ་",
"བཅུ་གཅིག་",
"བཅུ་གཉིས་",
"བཅུ་གསུམ་",
"བཅུ་བཞི་",
"བཅུ་ལྔ་",
"བཅུ་དྲུག་",
"བཅུ་བདུན་",
"བཅུ་པརྒྱད",
"བཅུ་དགུ་",
"ཉི་ཤུ་",
"སུམ་ཅུ",
"བཞི་བཅུ",
"ལྔ་བཅུ",
"དྲུག་ཅུ",
"བདུན་ཅུ",
"བརྒྱད་ཅུ",
"དགུ་བཅུ",
"བརྒྱ་",
"སྟོང་",
"ཁྲི་",
"ས་ཡ་",
" བྱེ་བ་",
"དུང་ཕྱུར་",
"ཐེར་འབུམ་",
"ཐེར་འབུམ་ཆེན་པོ་",
"ཁྲག་ཁྲིག་",
"ཁྲག་ཁྲིག་ཆེན་པོ་",
]

View File

@ -382,5 +382,7 @@ urrainn
ì
ò
ó
""".split("\n")
""".split(
"\n"
)
)

View File

@ -18,19 +18,18 @@ _exc = {
"càil": [{ORTH: "", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}],
"sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}],
"orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}],
"fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}]
"fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}],
}
# Hyphenations that are alternative forms of words
for exc_data in [
{ORTH: "fa-near",NORM: "fainear"},
{ORTH: "Fa-near",NORM: "Fainear"},
{ORTH: "fa-near", NORM: "fainear"},
{ORTH: "Fa-near", NORM: "Fainear"},
]:
_exc[exc_data[ORTH]] = [exc_data]
# Abbreviations and shortened words
for exc_data in [
{ORTH: "'", NORM: "a"},
@ -1529,7 +1528,7 @@ Weld-adh
Òige-sa
òrd-mhòr
Òrd-mhòr""".split():
_exc[orth] = [{ORTH: orth}]
_exc[orth] = [{ORTH: orth}]
# Multiple words that should remain as one token
for orth in """'n diugh
@ -1975,8 +1974,10 @@ Tron an
tuilleadh 's a chòir
Tuilleadh 's a chòir
tuilleadh sa chòir
Tuilleadh sa chòir""".split("\n"):
_exc[orth] = [{ORTH: orth}]
Tuilleadh sa chòir""".split(
"\n"
):
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -12,4 +12,5 @@ class Kurmanji(Language):
lang = "kmr"
Defaults = KurmanjiDefaults
__all__ = ["Kurmanji"]

View File

@ -80,10 +80,12 @@ def bg_tokenizer():
def bn_tokenizer():
return get_lang_class("bn")().tokenizer
@pytest.fixture(scope="session")
def bo_tokenizer():
    """Return a Tibetan (bo) tokenizer, built once and shared per session."""
    lang_cls = get_lang_class("bo")
    return lang_cls().tokenizer
@pytest.fixture(scope="session")
def ca_tokenizer():
    """Return a Catalan (ca) tokenizer, built once and shared per session."""
    lang_cls = get_lang_class("ca")
    return lang_cls().tokenizer

View File

@ -18,4 +18,4 @@ import pytest
def test_lex_attrs_like_number(bo_tokenizer, text, match):
    # Each parametrized numeral string must tokenize to exactly one token
    # whose like_num flag matches the expected value.  The merged diff had
    # left a duplicated final assert line; it is removed here.
    tokens = bo_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match

View File

@ -4,7 +4,18 @@ from spacy.lang.kmr.lex_attrs import like_num
# The merged diff left BOTH the old one-line argument list and the new
# expanded one inside the parametrize call, which is invalid syntax; this
# reconstructs the intended (expanded) form.
@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
    # Kurmanji ordinals (suffixes -em / -emîn, with buffer letter "y")
    # must be recognised as number-like.
    assert like_num(word)