Matthew Honnibal 2025-05-28 17:27:36 +02:00
parent 79f9d3ea2a
commit 80aa445f34
9 changed files with 183 additions and 127 deletions

View File

@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    tag_map = TAG_MAP


class HaitianCreole(Language):
    lang = "ht"
    Defaults = HaitianCreoleDefaults


@HaitianCreole.factory(
    "lemmatizer",
    assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )


__all__ = ["HaitianCreole"]
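Not shown in the diff, but useful orientation: a minimal, hypothetical sketch of attaching the "lemmatizer" factory registered above to a blank pipeline. It assumes the "ht" code resolves to the HaitianCreole class in this file and that the factory's default configuration needs no extra setup.

# Hypothetical usage sketch for the "lemmatizer" factory registered above.
import spacy

nlp = spacy.blank("ht")                  # instantiates HaitianCreole
lemmatizer = nlp.add_pipe("lemmatizer")  # factory defaults for mode/overwrite
print(nlp.pipe_names)                    # ["lemmatizer"]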

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
    "P": "Pa",
}


def like_num(text):
    text = text.strip().lower()
    if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
        return True
    return False


def norm_custom(text):
    return NORM_MAP.get(text, text.lower())


LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm_custom,
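For orientation, a small sketch (not part of the commit) of what these getters return; the import path is assumed from the file's contents, and the "'m" case mirrors the parametrized test later in this commit.

# Sketch of the lexical attribute getters defined above (import path assumed).
from spacy.lang.ht.lex_attrs import like_num, norm_custom

assert norm_custom("P") == "Pa"      # explicit entry in NORM_MAP
assert norm_custom("'m") == "mwen"   # matches the parametrized test in this commit
print(like_num("-25"))               # expected True; the startswith check above handles the sign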

View File

@@ -16,28 +16,43 @@ ELISION = "'".replace(" ", "")
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()

TOKENIZER_PREFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + [
        r"(?:({pe})[{el}])(?=[{a}])".format(
            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
        )
    ]
)

TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + LIST_ELLIPSES
    + [
        r"(?<=[0-9])%",  # numbers like 10%
        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
        r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
        r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
        r"(?<=[{a}])\.(?=\s|$)".format(
            a=ALPHA
        ),  # period after letter if space or end of string
        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
    ]
)

TOKENIZER_INFIXES = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
    ]
)
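These lists are raw regex fragments; spaCy compiles them into the tokenizer's prefix, suffix, and infix search functions. A minimal sketch of that wiring, assuming this file lives at spacy/lang/ht/punctuation.py (the path is not shown in the diff):

# Sketch: compiling the rules above the way spaCy's tokenizer does.
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.lang.ht.punctuation import (  # assumed import path
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
    TOKENIZER_INFIXES,
)

prefix_search = compile_prefix_regex(TOKENIZER_PREFIXES).search
suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
infix_finditer = compile_infix_regex(TOKENIZER_INFIXES).finditer

# The contraction rule should peel "'m" off a token like "Si'm".
match = suffix_search("Si'm")
print(match.group() if match else None)  # expected: "'m"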

View File

@@ -39,8 +39,7 @@ sa san si swa si
men mèsi oswa osinon
""".split()
)

# Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
from spacy.symbols import (
    NOUN,
    VERB,
    AUX,
    ADJ,
    ADV,
    PRON,
    DET,
    ADP,
    SCONJ,
    CCONJ,
    PART,
    INTJ,
    NUM,
    PROPN,
    PUNCT,
    SYM,
    X,
)

TAG_MAP = {
    "NOUN": {"pos": NOUN},

View File

@@ -1,5 +1,6 @@
from spacy.symbols import ORTH, NORM


def make_variants(base, first_norm, second_orth, second_norm):
    return {
        base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
            {ORTH: second_orth, NORM: second_norm},
        ],
        base.capitalize(): [
            {
                ORTH: base.split("'")[0].capitalize() + "'",
                NORM: first_norm.capitalize(),
            },
            {ORTH: second_orth, NORM: second_norm},
        ],
    }


TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}

# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))

# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update(
    {
        "map": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Map": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lem": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "Lem": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "lew": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "Lew": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "nap": [
            {ORTH: "n", NORM: "nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Nap": [
            {ORTH: "N", NORM: "Nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lap": [
            {ORTH: "l", NORM: "li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Lap": [
            {ORTH: "L", NORM: "Li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "yap": [
            {ORTH: "y", NORM: "yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Yap": [
            {ORTH: "Y", NORM: "Yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "mte": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "Mte": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "mpral": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "Mpral": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "wap": [
            {ORTH: "w", NORM: "ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Wap": [
            {ORTH: "W", NORM: "Ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "kap": [
            {ORTH: "k", NORM: "ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Kap": [
            {ORTH: "K", NORM: "Ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "tap": [
            {ORTH: "t", NORM: "te"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Tap": [
            {ORTH: "T", NORM: "Te"},
            {ORTH: "ap", NORM: "ap"},
        ],
    }
)
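A short usage sketch, not part of the commit, that exercises the exception table above through a blank pipeline; it assumes the "ht" code is registered with spaCy, as the tests in this commit imply.

# Hypothetical check of the contraction handling defined above.
import spacy

nlp = spacy.blank("ht")
doc = nlp("Si'm ka vini, m'ap pale ak li.")
print([t.text for t in doc])
# The tokenizer test in this commit expects:
# ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
print([t.norm_ for t in doc if t.text == "m'"])  # make_variants maps "m'" to "mwen"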

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
def test_ht_tokenizer_full_sentence(ht_tokenizer):
    text = "Si'm ka vini, m'ap pale ak li."
    tokens = [t.text for t in ht_tokenizer(text)]
    assert tokens == [
        "Si",
        "'m",
        "ka",
        "vini",
        ",",
        "m'",
        "ap",
        "pale",
        "ak",
        "li",
        ".",
    ]

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
    assert len(tokens) == 5


@pytest.mark.parametrize(
    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
)
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
    assert len(tokens) == 84


@pytest.mark.parametrize(
    "text,length",
    [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
@pytest.mark.parametrize(
    "word, expected",
    [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
    ],
)
def test_ht_lex_attrs_norm_custom(word, expected):
    assert norm_custom(word) == expected