Matthew Honnibal 2025-05-28 17:28:07 +02:00 committed by GitHub
commit b84c131df0
GPG Key ID: B5690EEEBB952194
13 changed files with 196 additions and 134 deletions

View File

@@ -91,6 +91,9 @@ IDS = {
    "MORPH": MORPH,
    "IDX": IDX,
}
# Make these ints in Python, so that we don't get this unexpected 'flag' type
# This will match the behaviour before Cython 3
IDS = {name: int(value) for name, value in IDS.items()}
# ATTR IDs, in order of the symbol
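
The int() coercion above keeps the attribute IDs as plain Python ints no matter what type Cython 3 gives the enum members. A minimal standalone sketch of the same pattern (the _Attr enum and its values are hypothetical stand-ins, not spaCy's real attribute IDs):

import enum

class _Attr(enum.IntFlag):  # stand-in for the Cython-generated enum type
    LEMMA = 73
    NORM = 74

IDS = {"LEMMA": _Attr.LEMMA, "NORM": _Attr.NORM}
IDS = {name: int(value) for name, value in IDS.items()}
assert all(type(value) is int for value in IDS.values())  # plain ints, as before Cython 3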

View File

@@ -5,11 +5,11 @@ from thinc.api import Model
from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class HaitianCreoleDefaults(BaseDefaults):
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    tag_map = TAG_MAP


class HaitianCreole(Language):
    lang = "ht"
    Defaults = HaitianCreoleDefaults
@HaitianCreole.factory(
"lemmatizer",
assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )
__all__ = ["HaitianCreole"]
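
With the factory registered under the name "lemmatizer", the component can be added to a blank Haitian Creole pipeline by name. A rough usage sketch (assuming this branch of spaCy is installed; the example is illustrative, not part of the commit):

import spacy

nlp = spacy.blank("ht")                  # Language subclass resolved via lang = "ht"
lemmatizer = nlp.add_pipe("lemmatizer")  # built through the factory registered above
print(nlp.pipe_names)                    # ['lemmatizer']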

View File

@@ -1,8 +1,8 @@
from typing import List, Tuple
from ...lookups import Lookups
from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups
class HaitianCreoleLemmatizer(Lemmatizer):

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
    "P": "Pa",
}
def like_num(text):
    text = text.strip().lower()
    if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
        return True
    return False


def norm_custom(text):
    return NORM_MAP.get(text, text.lower())
LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm_custom,
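
The hunk above is cut off mid-dict; for context, here is a standalone sketch of norm_custom's fallback behaviour (NORM_MAP abbreviated to the single entry shown above; the second token is illustrative and assumed not to be in the real map):

NORM_MAP = {"P": "Pa"}  # abbreviated; the real map holds many more entries

def norm_custom(text):
    return NORM_MAP.get(text, text.lower())

assert norm_custom("P") == "Pa"           # mapped entry wins
assert norm_custom("KREYÒL") == "kreyòl"  # anything else is simply lowercased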

View File

@@ -4,10 +4,10 @@ from ..char_classes import (
    ALPHA_UPPER,
    CONCAT_QUOTES,
    HYPHENS,
    LIST_PUNCT,
    LIST_QUOTES,
    LIST_ELLIPSES,
    LIST_ICONS,
    LIST_PUNCT,
    LIST_QUOTES,
    merge_chars,
)
@@ -16,28 +16,43 @@ ELISION = "'".replace(" ", "")
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
    r"(?:({pe})[{el}])(?=[{a}])".format(
        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
    )
]
TOKENIZER_PREFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + [
        r"(?:({pe})[{el}])(?=[{a}])".format(
            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
        )
    ]
)
TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
    r"(?<=[0-9])%",  # numbers like 10%
    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
    r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
    r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
]
TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + LIST_ELLIPSES
    + [
        r"(?<=[0-9])%",  # numbers like 10%
        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
        r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
        r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
        r"(?<=[{a}])\.(?=\s|$)".format(
            a=ALPHA
        ),  # period after letter if space or end of string
        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
    ]
)
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]
TOKENIZER_INFIXES = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
    ]
)
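
A quick way to sanity-check the elision prefix rule defined above is to try the single pattern in isolation. The sketch below substitutes a simplified character class for spaCy's merged ALPHA and for merge_chars(_prefixes_elision), so it is an approximation of the rule, not the tokenizer itself:

import re

ELISION = "'"
prefixes = "m|n|l|y|t|k|w|M|N|L|Y|T|K|W"  # simplified stand-in for merge_chars(_prefixes_elision)
pattern = r"(?:({pe})[{el}])(?=[{a}])".format(a="a-zA-Z", el=ELISION, pe=prefixes)

assert re.match(pattern, "m'ap")     # "m'" is split off as a prefix
assert re.match(pattern, "T'ap")     # capitalized variants are covered too
assert not re.match(pattern, "map")  # no apostrophe, no prefix match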

View File

@@ -39,8 +39,7 @@ sa san si swa si
men mèsi oswa osinon
"""
.split()
""".split()
)
# Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
from spacy.symbols import (
    ADJ,
    ADP,
    ADV,
    AUX,
    CCONJ,
    DET,
    INTJ,
    NOUN,
    NUM,
    PART,
    PRON,
    PROPN,
    PUNCT,
    SCONJ,
    SYM,
    VERB,
    X,
)

TAG_MAP = {
    "NOUN": {"pos": NOUN},

View File

@@ -1,4 +1,5 @@
from spacy.symbols import ORTH, NORM
from spacy.symbols import NORM, ORTH
def make_variants(base, first_norm, second_orth, second_norm):
    return {
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
            {ORTH: second_orth, NORM: second_norm},
        ],
        base.capitalize(): [
            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
            {
                ORTH: base.split("'")[0].capitalize() + "'",
                NORM: first_norm.capitalize(),
            },
            {ORTH: second_orth, NORM: second_norm},
        ]
        ],
    }
TOKENIZER_EXCEPTIONS = {
"Dr.": [{ORTH: "Dr."}]
}
TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update({
    "map": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Map": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lem": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "Lem": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "lew": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "Lew": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "nap": [
        {ORTH: "n", NORM: "nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Nap": [
        {ORTH: "N", NORM: "Nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lap": [
        {ORTH: "l", NORM: "li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Lap": [
        {ORTH: "L", NORM: "Li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "yap": [
        {ORTH: "y", NORM: "yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Yap": [
        {ORTH: "Y", NORM: "Yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "mte": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "Mte": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "mpral": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "Mpral": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "wap": [
        {ORTH: "w", NORM: "ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Wap": [
        {ORTH: "W", NORM: "Ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "kap": [
        {ORTH: "k", NORM: "ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Kap": [
        {ORTH: "K", NORM: "Ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "tap": [
        {ORTH: "t", NORM: "te"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Tap": [
        {ORTH: "T", NORM: "Te"},
        {ORTH: "ap", NORM: "ap"},
    ],
})
TOKENIZER_EXCEPTIONS.update(
    {
        "map": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Map": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lem": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "Lem": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "lew": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "Lew": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "nap": [
            {ORTH: "n", NORM: "nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Nap": [
            {ORTH: "N", NORM: "Nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lap": [
            {ORTH: "l", NORM: "li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Lap": [
            {ORTH: "L", NORM: "Li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "yap": [
            {ORTH: "y", NORM: "yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Yap": [
            {ORTH: "Y", NORM: "Yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "mte": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "Mte": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "mpral": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "Mpral": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "wap": [
            {ORTH: "w", NORM: "ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Wap": [
            {ORTH: "W", NORM: "Ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "kap": [
            {ORTH: "k", NORM: "ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Kap": [
            {ORTH: "K", NORM: "Ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "tap": [
            {ORTH: "t", NORM: "te"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Tap": [
            {ORTH: "T", NORM: "Te"},
            {ORTH: "ap", NORM: "ap"},
        ],
    }
)

View File

@@ -23,7 +23,9 @@ IDS = {
    "SPACE": SPACE
}
# Make these ints in Python, so that we don't get this unexpected 'flag' type
# This will match the behaviour before Cython 3
IDS = {name: int(value) for name, value in IDS.items()}
NAMES = {value: key for key, value in IDS.items()}
# As of Cython 3.1, the global Python namespace no longer has the enum
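
The NAMES mapping simply inverts IDS, so the two stay consistent once the values are coerced to plain ints. A tiny round-trip check (the numeric value is a hypothetical stand-in for the real enum member):

IDS = {"SPACE": int(103)}  # stand-in id; in the real module this comes from the Cython enum
NAMES = {value: key for key, value in IDS.items()}
assert NAMES[IDS["SPACE"]] == "SPACE"  # name -> id -> name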

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
def test_ht_tokenizer_full_sentence(ht_tokenizer):
    text = "Si'm ka vini, m'ap pale ak li."
    tokens = [t.text for t in ht_tokenizer(text)]
    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
    assert tokens == [
        "Si",
        "'m",
        "ka",
        "vini",
        ",",
        "m'",
        "ap",
        "pale",
        "ak",
        "li",
        ".",
    ]

View File

@@ -1,4 +1,5 @@
import pytest
from spacy.tokens import Doc

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
    assert len(tokens) == 5


@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
@pytest.mark.parametrize(
    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
)
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
    assert len(tokens) == 84


@pytest.mark.parametrize(
    "text,length",
    [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
@pytest.mark.parametrize(
    "word, expected", [
    "word, expected",
    [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
    ]
    ],
)
def test_ht_lex_attrs_norm_custom(word, expected):
    assert norm_custom(word) == expected