Format

Commit 59ac7e6bdb (parent b65491b641), mirror of https://github.com/explosion/spaCy.git
spacy/lang/bo/lex_attrs.py

@@ -4,10 +4,43 @@ from ...attrs import LIKE_NUM
 _num_words = [
     "ཀླད་ཀོར་",
-    "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་",
-    "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
-    "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་",
-    "སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
+    "གཅིག་",
+    "གཉིས་",
+    "གསུམ་",
+    "བཞི་",
+    "ལྔ་",
+    "དྲུག་",
+    "བདུན་",
+    "བརྒྱད་",
+    "དགུ་",
+    "བཅུ་",
+    "བཅུ་གཅིག་",
+    "བཅུ་གཉིས་",
+    "བཅུ་གསུམ་",
+    "བཅུ་བཞི་",
+    "བཅུ་ལྔ་",
+    "བཅུ་དྲུག་",
+    "བཅུ་བདུན་",
+    "བཅུ་པརྒྱད",
+    "བཅུ་དགུ་",
+    "ཉི་ཤུ་",
+    "སུམ་ཅུ",
+    "བཞི་བཅུ",
+    "ལྔ་བཅུ",
+    "དྲུག་ཅུ",
+    "བདུན་ཅུ",
+    "བརྒྱད་ཅུ",
+    "དགུ་བཅུ",
+    "བརྒྱ་",
+    "སྟོང་",
+    "ཁྲི་",
+    "ས་ཡ་",
+    " བྱེ་བ་",
+    "དུང་ཕྱུར་",
+    "ཐེར་འབུམ་",
+    "ཐེར་འབུམ་ཆེན་པོ་",
+    "ཁྲག་ཁྲིག་",
+    "ཁྲག་ཁྲིག་ཆེན་པོ་",
 ]
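For context, spaCy lex_attrs modules usually wire a list like _num_words into a like_num callable exported through a LEX_ATTRS dict. The following is a minimal sketch of that pattern, assuming the diffed file follows the usual layout; the function body is illustrative, not the exact Tibetan implementation, and the list is abbreviated to the first entries from the hunk above.

from spacy.attrs import LIKE_NUM  # the diffed file imports this relatively

_num_words = ["ཀླད་ཀོར་", "གཅིག་", "གཉིས་"]  # abbreviated from the full list above


def like_num(text):
    # Strip a leading sign, drop separators, then check for plain digits.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Fall back to the spelled-out number words.
    return text in _num_words


LEX_ATTRS = {LIKE_NUM: like_num}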
spacy/lang/gd/stop_words.py

@@ -382,5 +382,7 @@ urrainn
 ì
 ò
 ó
-""".split("\n")
+""".split(
+        "\n"
+    )
 )
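The hunk above only re-wraps a .split("\n") call at the end of a stop-word list. For orientation, a sketch of the enclosing pattern in spaCy stop-word modules, with placeholder words standing in for the real Scottish Gaelic list that ends in the "ì", "ò", "ó" entries visible in the diff:

# Placeholder words; the real module lists hundreds of Gaelic stop words.
STOP_WORDS = set(
    """agus
air
ì
ò
ó
""".split(
        "\n"
    )
)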
spacy/lang/gd/tokenizer_exceptions.py

@@ -18,19 +18,18 @@ _exc = {
     "càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}],
     "sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}],
     "orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}],
-    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}]
+    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}],
 }


 # Hyphenations that are alternative forms of words
 for exc_data in [
-    {ORTH: "fa-near",NORM: "fainear"},
-    {ORTH: "Fa-near",NORM: "Fainear"},
+    {ORTH: "fa-near", NORM: "fainear"},
+    {ORTH: "Fa-near", NORM: "Fainear"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]


-
 # Abreviations and shortened words
 for exc_data in [
     {ORTH: "'", NORM: "a"},
@@ -1529,7 +1528,7 @@ Weld-adh
 Òige-sa
 òrd-mhòr
 Òrd-mhòr""".split():
     _exc[orth] = [{ORTH: orth}]

 # Multiple words that should remain as one token
 for orth in """'n diugh
@@ -1975,8 +1974,10 @@ Tron an
 tuilleadh 's a chòir
 Tuilleadh 's a chòir
 tuilleadh sa chòir
-Tuilleadh sa chòir""".split("\n"):
+Tuilleadh sa chòir""".split(
+    "\n"
+):
     _exc[orth] = [{ORTH: orth}]


 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
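A hedged usage sketch of what these exception entries do at runtime: in a spaCy build that ships this Scottish Gaelic data, the tokenizer should split each listed string into its ORTH pieces while exposing the NORM values, e.g. for the "fiùs" entry edited above. The expected outputs are read off the exception table, not verified here.

import spacy

# Requires a spaCy version that includes the gd language data edited above.
nlp = spacy.blank("gd")

doc = nlp("fiùs")
print([t.text for t in doc])   # expected: ["fiù", "s"]
print([t.norm_ for t in doc])  # expected: ["fiù", "'s"]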
spacy/lang/kmr/__init__.py

@@ -12,4 +12,5 @@ class Kurmanji(Language):
     lang = "kmr"
     Defaults = KurmanjiDefaults

+
 __all__ = ["Kurmanji"]
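Because the class registers lang = "kmr", pipelines can be created through spaCy's usual factory. A small sketch, assuming a spaCy build that includes Kurmanji:

import spacy

nlp = spacy.blank("kmr")
print(nlp.lang)  # "kmr"
doc = nlp("yekem")  # tokenize a single Kurmanji word
print([t.text for t in doc])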
spacy/tests/conftest.py

@@ -80,10 +80,12 @@ def bg_tokenizer():
 def bn_tokenizer():
     return get_lang_class("bn")().tokenizer

+
 @pytest.fixture(scope="session")
 def bo_tokenizer():
     return get_lang_class("bo")().tokenizer

+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
     return get_lang_class("ca")().tokenizer
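The session-scoped bo_tokenizer fixture is resolved by name, so a Tibetan test module needs no setup of its own. A hypothetical test in the style of spacy/tests (the test name and assertion are invented for illustration):

# pytest injects bo_tokenizer from conftest.py by matching the parameter name.
def test_bo_tokenizer_basic(bo_tokenizer):
    tokens = bo_tokenizer("ཀླད་ཀོར་")
    assert len(tokens) >= 1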
spacy/tests/lang/kmr/test_text.py

@@ -4,7 +4,18 @@ from spacy.lang.kmr.lex_attrs import like_num


 @pytest.mark.parametrize(
-    "word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"]
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
 )
 def test_kmr_lex_attrs_like_number_for_ordinal(word):
     assert like_num(word)
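Every parametrized word above carries a Kurmanji ordinal suffix (-em, -emîn, -yem, -yemîn). The following is a minimal sketch of a suffix check that would accept all of them; it is an assumption about the shape of the real spacy.lang.kmr.lex_attrs.like_num, not a copy of it.

_ordinal_suffixes = ("em", "emîn", "yem", "yemîn")


def like_num_sketch(text: str) -> bool:
    # Plain digits count as number-like.
    if text.isdigit():
        return True
    # Ordinals such as "yekem" or "50yemîn" end in an ordinal suffix.
    return text.lower().endswith(_ordinal_suffixes)


# All of the test's parametrized words pass this check.
assert all(
    like_num_sketch(w)
    for w in ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"]
)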