Format

2025-11-07 03:17:37 +03:00 · 2024-09-09 11:22:52 +02:00 · 2024-09-09 11:22:52 +02:00 · 59ac7e6bdb
commit 59ac7e6bdb
parent b65491b641
7 changed files with 68 additions and 18 deletions
--- a/spacy/lang/bo/lex_attrs.py
+++ b/spacy/lang/bo/lex_attrs.py
@@ -4,10 +4,43 @@ from ...attrs import LIKE_NUM

 _num_words = [
    "ཀླད་ཀོར་",
-    "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་",
-    "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
-    "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་",
-    "སྟོང་", "ཁྲི་", "ས་ཡ་", "	བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
+    "གཅིག་",
+    "གཉིས་",
+    "གསུམ་",
+    "བཞི་",
+    "ལྔ་",
+    "དྲུག་",
+    "བདུན་",
+    "བརྒྱད་",
+    "དགུ་",
+    "བཅུ་",
+    "བཅུ་གཅིག་",
+    "བཅུ་གཉིས་",
+    "བཅུ་གསུམ་",
+    "བཅུ་བཞི་",
+    "བཅུ་ལྔ་",
+    "བཅུ་དྲུག་",
+    "བཅུ་བདུན་",
+    "བཅུ་པརྒྱད",
+    "བཅུ་དགུ་",
+    "ཉི་ཤུ་",
+    "སུམ་ཅུ",
+    "བཞི་བཅུ",
+    "ལྔ་བཅུ",
+    "དྲུག་ཅུ",
+    "བདུན་ཅུ",
+    "བརྒྱད་ཅུ",
+    "དགུ་བཅུ",
+    "བརྒྱ་",
+    "སྟོང་",
+    "ཁྲི་",
+    "ས་ཡ་",
+    "	བྱེ་བ་",
+    "དུང་ཕྱུར་",
+    "ཐེར་འབུམ་",
+    "ཐེར་འབུམ་ཆེན་པོ་",
+    "ཁྲག་ཁྲིག་",
+    "ཁྲག་ཁྲིག་ཆེན་པོ་",
 ]


--- a/spacy/lang/gd/stop_words.py
+++ b/spacy/lang/gd/stop_words.py
@@ -382,5 +382,7 @@ urrainn
 ì
 ò
 ó
-""".split("\n")
+""".split(
+        "\n"
+    )
 )
--- a/spacy/lang/gd/tokenizer_exceptions.py
+++ b/spacy/lang/gd/tokenizer_exceptions.py
@@ -18,19 +18,18 @@ _exc = {
    "càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}],
    "sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}],
    "orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}],
-    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}]
+    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}],
 }


 # Hyphenations that are alternative forms of words
 for exc_data in [
-    {ORTH: "fa-near",NORM: "fainear"},
-    {ORTH: "Fa-near",NORM: "Fainear"},
+    {ORTH: "fa-near", NORM: "fainear"},
+    {ORTH: "Fa-near", NORM: "Fainear"},
 ]:
    _exc[exc_data[ORTH]] = [exc_data]


-    
 # Abreviations and shortened words
 for exc_data in [
    {ORTH: "'", NORM: "a"},
@@ -1529,7 +1528,7 @@ Weld-adh
 Òige-sa
 òrd-mhòr
 Òrd-mhòr""".split():
-   _exc[orth] = [{ORTH: orth}]
+    _exc[orth] = [{ORTH: orth}]

 # Multiple words that should remain as one token
 for orth in """'n diugh
@@ -1975,8 +1974,10 @@ Tron an
 tuilleadh 's a chòir
 Tuilleadh 's a chòir
 tuilleadh sa chòir
-Tuilleadh sa chòir""".split("\n"):
-  _exc[orth] = [{ORTH: orth}]
+Tuilleadh sa chòir""".split(
+    "\n"
+):
+    _exc[orth] = [{ORTH: orth}]


 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
--- a/spacy/lang/kmr/__init__.py
+++ b/spacy/lang/kmr/__init__.py
@@ -12,4 +12,5 @@ class Kurmanji(Language):
    lang = "kmr"
    Defaults = KurmanjiDefaults

+
 __all__ = ["Kurmanji"]
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -80,10 +80,12 @@ def bg_tokenizer():
 def bn_tokenizer():
    return get_lang_class("bn")().tokenizer

+
 @pytest.fixture(scope="session")
 def bo_tokenizer():
    return get_lang_class("bo")().tokenizer

+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
    return get_lang_class("ca")().tokenizer
--- a/spacy/tests/lang/kmr/test_text.py
+++ b/spacy/tests/lang/kmr/test_text.py
@@ -4,7 +4,18 @@ from spacy.lang.kmr.lex_attrs import like_num


 @pytest.mark.parametrize(
-    "word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"]
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
 )
 def test_kmr_lex_attrs_like_number_for_ordinal(word):
    assert like_num(word)