From 59ac7e6bdb32b34c58e94a65b8f93fe63d4290e7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 9 Sep 2024 11:22:52 +0200
Subject: [PATCH] Format

---
 spacy/lang/bo/lex_attrs.py            | 41 ++++++++++++++++++++++++---
 spacy/lang/gd/stop_words.py           |  4 ++-
 spacy/lang/gd/tokenizer_exceptions.py | 23 ++++++++++-------
 spacy/lang/kmr/__init__.py            |  1 +
 spacy/tests/conftest.py               |  2 ++
 spacy/tests/lang/bo/test_text.py      |  2 +-
 spacy/tests/lang/kmr/test_text.py     | 13 ++++++++-
 7 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/spacy/lang/bo/lex_attrs.py b/spacy/lang/bo/lex_attrs.py
index c6a593868..5535934af 100644
--- a/spacy/lang/bo/lex_attrs.py
+++ b/spacy/lang/bo/lex_attrs.py
@@ -4,10 +4,43 @@ from ...attrs import LIKE_NUM
 
 _num_words = [
     "ཀླད་ཀོར་",
-    "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་",
-    "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
-    "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་",
-    "སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
+    "གཅིག་",
+    "གཉིས་",
+    "གསུམ་",
+    "བཞི་",
+    "ལྔ་",
+    "དྲུག་",
+    "བདུན་",
+    "བརྒྱད་",
+    "དགུ་",
+    "བཅུ་",
+    "བཅུ་གཅིག་",
+    "བཅུ་གཉིས་",
+    "བཅུ་གསུམ་",
+    "བཅུ་བཞི་",
+    "བཅུ་ལྔ་",
+    "བཅུ་དྲུག་",
+    "བཅུ་བདུན་",
+    "བཅུ་པརྒྱད",
+    "བཅུ་དགུ་",
+    "ཉི་ཤུ་",
+    "སུམ་ཅུ",
+    "བཞི་བཅུ",
+    "ལྔ་བཅུ",
+    "དྲུག་ཅུ",
+    "བདུན་ཅུ",
+    "བརྒྱད་ཅུ",
+    "དགུ་བཅུ",
+    "བརྒྱ་",
+    "སྟོང་",
+    "ཁྲི་",
+    "ས་ཡ་",
+    " བྱེ་བ་",
+    "དུང་ཕྱུར་",
+    "ཐེར་འབུམ་",
+    "ཐེར་འབུམ་ཆེན་པོ་",
+    "ཁྲག་ཁྲིག་",
+    "ཁྲག་ཁྲིག་ཆེན་པོ་",
 ]
 
 
diff --git a/spacy/lang/gd/stop_words.py b/spacy/lang/gd/stop_words.py
index d5132c35e..9f5a66cbc 100644
--- a/spacy/lang/gd/stop_words.py
+++ b/spacy/lang/gd/stop_words.py
@@ -382,5 +382,7 @@ urrainn
 ì
 ò
 ó
-""".split("\n")
+""".split(
+    "\n"
+)
 )
diff --git a/spacy/lang/gd/tokenizer_exceptions.py b/spacy/lang/gd/tokenizer_exceptions.py
index bf47bd859..76e169d90 100644
--- a/spacy/lang/gd/tokenizer_exceptions.py
+++ b/spacy/lang/gd/tokenizer_exceptions.py
@@ -18,19 +18,18 @@ _exc = {
     "càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}],
     "sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}],
     "orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}],
-    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}]
+    "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}],
 }
- 
+
 
 # Hyphenations that are alternative forms of words
 for exc_data in [
-    {ORTH: "fa-near",NORM: "fainear"},
-    {ORTH: "Fa-near",NORM: "Fainear"},
+    {ORTH: "fa-near", NORM: "fainear"},
+    {ORTH: "Fa-near", NORM: "Fainear"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
-
-
-
+
+
 # Abreviations and shortened words
 for exc_data in [
     {ORTH: "'", NORM: "a"},
@@ -1529,7 +1528,7 @@ Weld-adh
 Òige-sa
 òrd-mhòr
 Òrd-mhòr""".split():
-     _exc[orth] = [{ORTH: orth}]
+    _exc[orth] = [{ORTH: orth}]
 
 # Multiple words that should remain as one token
 for orth in """'n diugh
@@ -1975,8 +1974,10 @@ Tron an
 tuilleadh 's a chòir
 Tuilleadh 's a chòir
 tuilleadh sa chòir
-Tuilleadh sa chòir""".split("\n"):
-    _exc[orth] = [{ORTH: orth}]
-
+Tuilleadh sa chòir""".split(
+    "\n"
+):
+    _exc[orth] = [{ORTH: orth}]
+
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py
index 379696f23..124321a8e 100644
--- a/spacy/lang/kmr/__init__.py
+++ b/spacy/lang/kmr/__init__.py
@@ -12,4 +12,5 @@ class Kurmanji(Language):
     lang = "kmr"
     Defaults = KurmanjiDefaults
 
+
 __all__ = ["Kurmanji"]
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index d2bc02081..e30300a33 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -80,10 +80,12 @@ def bg_tokenizer():
 def bn_tokenizer():
     return get_lang_class("bn")().tokenizer
 
+
 @pytest.fixture(scope="session")
 def bo_tokenizer():
     return get_lang_class("bo")().tokenizer
 
+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
     return get_lang_class("ca")().tokenizer
diff --git a/spacy/tests/lang/bo/test_text.py b/spacy/tests/lang/bo/test_text.py
index 9275c15c4..fb3900d51 100644
--- a/spacy/tests/lang/bo/test_text.py
+++ b/spacy/tests/lang/bo/test_text.py
@@ -18,4 +18,4 @@ import pytest
 def test_lex_attrs_like_number(bo_tokenizer, text, match):
     tokens = bo_tokenizer(text)
     assert len(tokens) == 1
-    assert tokens[0].like_num == match
\ No newline at end of file
+    assert tokens[0].like_num == match
diff --git a/spacy/tests/lang/kmr/test_text.py b/spacy/tests/lang/kmr/test_text.py
index 209f386ec..405dc28f6 100644
--- a/spacy/tests/lang/kmr/test_text.py
+++ b/spacy/tests/lang/kmr/test_text.py
@@ -4,7 +4,18 @@ from spacy.lang.kmr.lex_attrs import like_num
 
 
 @pytest.mark.parametrize(
-    "word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"]
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
 )
 def test_kmr_lex_attrs_like_number_for_ordinal(word):
     assert like_num(word)