commit b84c131df0
Matthew Honnibal, 2025-05-28 17:28:07 +02:00 (committed by GitHub)
13 changed files with 196 additions and 134 deletions

View File

@@ -91,6 +91,9 @@ IDS = {
     "MORPH": MORPH,
     "IDX": IDX,
 }
+# Make these ints in Python, so that we don't get this unexpected 'flag' type
+# This will match the behaviour before Cython 3
+IDS = {name: int(value) for name, value in IDS.items()}
 # ATTR IDs, in order of the symbol
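Note (not part of the diff): the comprehension added here is the substance of the change. Under Cython 3 the values coming out of the cdef enum are enum/"flag" objects rather than plain Python ints, so coercing them restores the pre-Cython-3 behaviour. A minimal sketch of the invariant this guarantees:

from spacy.attrs import IDS

# After the comprehension above, every attribute ID is a plain Python int,
# not a Cython 3 enum/"flag" object, so comparisons and serialisation behave
# as they did before Cython 3.
assert all(type(value) is int for value in IDS.values())
assert type(IDS["IDX"]) is int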

View File

@@ -5,11 +5,11 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import HaitianCreoleLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class HaitianCreoleDefaults(BaseDefaults):
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 __all__ = ["HaitianCreole"]

View File

@@ -1,8 +1,8 @@
 from typing import List, Tuple
+from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...tokens import Token
-from ...lookups import Lookups
 class HaitianCreoleLemmatizer(Lemmatizer):

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
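These lexical attribute helpers are pure functions, so their behaviour is easy to pin down. A short sketch (mirroring the tests later in this commit; the import path spacy.lang.ht.lex_attrs is assumed):

from spacy.lang.ht.lex_attrs import like_num, norm_custom

# like_num accepts plain digit strings (leading +/-/±/~ are stripped first).
assert like_num("25")
# norm_custom prefers NORM_MAP entries and falls back to lowercasing.
assert norm_custom("'m") == "mwen"
assert norm_custom("Bonjou") == "bonjou"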

View File

@@ -4,10 +4,10 @@ from ..char_classes import (
     ALPHA_UPPER,
     CONCAT_QUOTES,
     HYPHENS,
-    LIST_PUNCT,
-    LIST_QUOTES,
     LIST_ELLIPSES,
     LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
     merge_chars,
 )
@@ -16,23 +16,37 @@ ELISION = "'".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
         r"(?:({pe})[{el}])(?=[{a}])".format(
             a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
         )
     ]
+)
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
         r"(?<=[0-9])%",  # numbers like 10%
         r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
         r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
         r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
         r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-        r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
         r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
     ]
+)
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
             al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
@@ -40,4 +54,5 @@ TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
         r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
     ]
+)
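The three lists above only change shape here (black-style wrapping), not content; they are consumed by spaCy's tokenizer machinery. A small sketch of how the prefix rules get compiled and what the elision rule does (not part of the commit):

from spacy.lang.ht.punctuation import TOKENIZER_PREFIXES
from spacy.util import compile_prefix_regex

# All prefix rules are joined into a single anchored regex that the tokenizer
# applies repeatedly to the front of each token candidate.
prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
# The elision rule splits the short pronoun plus apostrophe off the next word,
# e.g. the "m'" in "m'ap".
assert prefix_re.search("m'ap") is not None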

View File

@@ -39,8 +39,7 @@ sa san si swa si
 men mèsi oswa osinon
-"""
-.split()
+""".split()
 )
 # Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    ADJ,
+    ADP,
+    ADV,
+    AUX,
+    CCONJ,
+    DET,
+    INTJ,
+    NOUN,
+    NUM,
+    PART,
+    PRON,
+    PROPN,
+    PUNCT,
+    SCONJ,
+    SYM,
+    VERB,
+    X,
+)
 TAG_MAP = {
     "NOUN": {"pos": NOUN},

View File

@@ -1,4 +1,5 @@
-from spacy.symbols import ORTH, NORM
+from spacy.symbols import NORM, ORTH
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,7 +32,8 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
+TOKENIZER_EXCEPTIONS.update(
+    {
         "map": [
             {ORTH: "m", NORM: "mwen"},
             {ORTH: "ap", NORM: "ap"},
@@ -118,4 +122,5 @@ TOKENIZER_EXCEPTIONS.update({
             {ORTH: "T", NORM: "Te"},
             {ORTH: "ap", NORM: "ap"},
         ],
-})
+    }
+)
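Taken together with make_variants above, each apostrophe contraction expands into a two-token exception. A rough illustration of the resulting entries (consistent with the full-sentence tokenizer test later in this commit; the import path is assumed):

from spacy.lang.ht.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from spacy.symbols import NORM, ORTH

# "m'ap" is split into "m'" + "ap", normalised to "mwen" + "ap";
# the capitalized variant "M'ap" gets a matching entry of its own.
entry = TOKENIZER_EXCEPTIONS["m'ap"]
assert [piece[ORTH] for piece in entry] == ["m'", "ap"]
assert [piece[NORM] for piece in entry] == ["mwen", "ap"]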

View File

@@ -23,7 +23,9 @@ IDS = {
     "SPACE": SPACE
 }
+# Make these ints in Python, so that we don't get this unexpected 'flag' type
+# This will match the behaviour before Cython 3
+IDS = {name: int(value) for name, value in IDS.items()}
 NAMES = {value: key for key, value in IDS.items()}
 # As of Cython 3.1, the global Python namespace no longer has the enum
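Same pattern as the attrs change above: because NAMES is built from IDS right after the cast, the ID-to-name round trip keeps working on plain ints. A quick check (not part of the commit):

from spacy.parts_of_speech import IDS, NAMES

# Plain ints, and the reverse mapping built from them still round-trips.
assert all(type(value) is int for value in IDS.values())
assert all(NAMES[value] == name for name, value in IDS.items())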

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]

View File

@@ -1,4 +1,5 @@
 import pytest
 from spacy.tokens import Doc

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected