Format

Commit 59ac7e6bdb (parent b65491b641), mirror of https://github.com/explosion/spaCy.git
spacy/lang/bo/lex_attrs.py

@@ -4,10 +4,43 @@ from ...attrs import LIKE_NUM
 _num_words = [
     "ཀླད་ཀོར་",
-    "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་",
-    "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
-    "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་",
-    "སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
+    "གཅིག་",
+    "གཉིས་",
+    "གསུམ་",
+    "བཞི་",
+    "ལྔ་",
+    "དྲུག་",
+    "བདུན་",
+    "བརྒྱད་",
+    "དགུ་",
+    "བཅུ་",
+    "བཅུ་གཅིག་",
+    "བཅུ་གཉིས་",
+    "བཅུ་གསུམ་",
+    "བཅུ་བཞི་",
+    "བཅུ་ལྔ་",
+    "བཅུ་དྲུག་",
+    "བཅུ་བདུན་",
+    "བཅུ་པརྒྱད",
+    "བཅུ་དགུ་",
+    "ཉི་ཤུ་",
+    "སུམ་ཅུ",
+    "བཞི་བཅུ",
+    "ལྔ་བཅུ",
+    "དྲུག་ཅུ",
+    "བདུན་ཅུ",
+    "བརྒྱད་ཅུ",
+    "དགུ་བཅུ",
+    "བརྒྱ་",
+    "སྟོང་",
+    "ཁྲི་",
+    "ས་ཡ་",
+    " བྱེ་བ་",
+    "དུང་ཕྱུར་",
+    "ཐེར་འབུམ་",
+    "ཐེར་འབུམ་ཆེན་པོ་",
+    "ཁྲག་ཁྲིག་",
+    "ཁྲག་ཁྲིག་ཆེན་པོ་",
 ]
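For context, spaCy lex_attrs modules usually wire a list like _num_words into a like_num callable exported through a LEX_ATTRS dict. The following is a minimal sketch of that pattern, assuming the diffed file follows the usual layout; the function body is illustrative, not the exact Tibetan implementation, and the list is abbreviated to the first entries from the hunk above.

from spacy.attrs import LIKE_NUM  # the diffed file imports this relatively

_num_words = ["ཀླད་ཀོར་", "གཅིག་", "གཉིས་"]  # abbreviated from the full list above


def like_num(text):
    # Strip a leading sign, drop separators, then check for plain digits.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Fall back to the spelled-out number words.
    return text in _num_words


LEX_ATTRS = {LIKE_NUM: like_num}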
spacy/lang/gd/stop_words.py

@@ -382,5 +382,7 @@ urrainn
 ì
 ò
 ó
-""".split("\n")
+""".split(
+        "\n"
+    )
 )
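The hunk above only re-wraps a .split("\n") call at the end of a stop-word list. For orientation, a sketch of the enclosing pattern in spaCy stop-word modules, with placeholder words standing in for the real Scottish Gaelic list that ends in the "ì", "ò", "ó" entries visible in the diff:

# Placeholder words; the real module lists hundreds of Gaelic stop words.
STOP_WORDS = set(
    """agus
air
ì
ò
ó
""".split(
        "\n"
    )
)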
spacy/lang/gd/tokenizer_exceptions.py

@@ -18,19 +18,18 @@ _exc = {
     "càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}],
     "sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}],
     "orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}],
-    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}]
+    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}],
 }


 # Hyphenations that are alternative forms of words
 for exc_data in [
-    {ORTH: "fa-near",NORM: "fainear"},
-    {ORTH: "Fa-near",NORM: "Fainear"},
+    {ORTH: "fa-near", NORM: "fainear"},
+    {ORTH: "Fa-near", NORM: "Fainear"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]


-
 # Abreviations and shortened words
 for exc_data in [
     {ORTH: "'", NORM: "a"},
@@ -1529,7 +1528,7 @@ Weld-adh
 Òige-sa
 òrd-mhòr
 Òrd-mhòr""".split():
     _exc[orth] = [{ORTH: orth}]

 # Multiple words that should remain as one token
 for orth in """'n diugh
@@ -1975,8 +1974,10 @@ Tron an
 tuilleadh 's a chòir
 Tuilleadh 's a chòir
 tuilleadh sa chòir
-Tuilleadh sa chòir""".split("\n"):
+Tuilleadh sa chòir""".split(
+    "\n"
+):
     _exc[orth] = [{ORTH: orth}]


 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
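A hedged usage sketch of what these exception entries do at runtime: in a spaCy build that ships this Scottish Gaelic data, the tokenizer should split each listed string into its ORTH pieces while exposing the NORM values, e.g. for the "fiùs" entry edited above. The expected outputs are read off the exception table, not verified here.

import spacy

# Requires a spaCy version that includes the gd language data edited above.
nlp = spacy.blank("gd")

doc = nlp("fiùs")
print([t.text for t in doc])   # expected: ["fiù", "s"]
print([t.norm_ for t in doc])  # expected: ["fiù", "'s"]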
spacy/lang/kmr/__init__.py

@@ -12,4 +12,5 @@ class Kurmanji(Language):
     lang = "kmr"
     Defaults = KurmanjiDefaults

+
 __all__ = ["Kurmanji"]
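Because the class registers lang = "kmr", pipelines can be created through spaCy's usual factory. A small sketch, assuming a spaCy build that includes Kurmanji:

import spacy

nlp = spacy.blank("kmr")
print(nlp.lang)  # "kmr"
doc = nlp("yekem")  # tokenize a single Kurmanji word
print([t.text for t in doc])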
spacy/tests/conftest.py

@@ -80,10 +80,12 @@ def bg_tokenizer():
 def bn_tokenizer():
     return get_lang_class("bn")().tokenizer

+
 @pytest.fixture(scope="session")
 def bo_tokenizer():
     return get_lang_class("bo")().tokenizer

+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
     return get_lang_class("ca")().tokenizer
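The session-scoped bo_tokenizer fixture is resolved by name, so a Tibetan test module needs no setup of its own. A hypothetical test in the style of spacy/tests (the test name and assertion are invented for illustration):

# pytest injects bo_tokenizer from conftest.py by matching the parameter name.
def test_bo_tokenizer_basic(bo_tokenizer):
    tokens = bo_tokenizer("ཀླད་ཀོར་")
    assert len(tokens) >= 1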
spacy/tests/lang/kmr/test_text.py

@@ -4,7 +4,18 @@ from spacy.lang.kmr.lex_attrs import like_num


 @pytest.mark.parametrize(
-    "word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"]
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
 )
 def test_kmr_lex_attrs_like_number_for_ordinal(word):
     assert like_num(word)
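Every parametrized word above carries a Kurmanji ordinal suffix (-em, -emîn, -yem, -yemîn). The following is a minimal sketch of a suffix check that would accept all of them; it is an assumption about the shape of the real spacy.lang.kmr.lex_attrs.like_num, not a copy of it.

_ordinal_suffixes = ("em", "emîn", "yem", "yemîn")


def like_num_sketch(text: str) -> bool:
    # Plain digits count as number-like.
    if text.isdigit():
        return True
    # Ordinals such as "yekem" or "50yemîn" end in an ordinal suffix.
    return text.lower().endswith(_ordinal_suffixes)


# All of the test's parametrized words pass this check.
assert all(
    like_num_sketch(w)
    for w in ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"]
)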