From 5e1ee975c94287a984bc231fe82d02d8153c8d7e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 27 May 2025 11:09:37 +0200 Subject: [PATCH 1/3] Fix quirk of enum values in Python After the Cython 3 change, the types of enum members such as spacy.parts_of_speech.NOUN became 'flag', rather than simple 'int'. This change mostly doesn't matter because the flag type does duck-type like an int -- it compares, additions, prints etc the same. However, it doesn't repr the same and if you do an isinstance check it will fail. It's therefore better to just make them ints like they were before. --- spacy/attrs.pyx | 3 +++ spacy/parts_of_speech.pyx | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 363dd094d..50b868bc4 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -91,6 +91,9 @@ IDS = { "MORPH": MORPH, "IDX": IDX, } +# Make these ints in Python, so that we don't get this unexpected 'flag' type +# This will match the behaviour before Cython 3 +IDS = {name: int(value) for name, value in IDS.items()} # ATTR IDs, in order of the symbol diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 1e643c099..9e539c16c 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -23,7 +23,9 @@ IDS = { "SPACE": SPACE } - +# Make these ints in Python, so that we don't get this unexpected 'flag' type +# This will match the behaviour before Cython 3 +IDS = {name: int(value) for name, value in IDS.items()} NAMES = {value: key for key, value in IDS.items()} # As of Cython 3.1, the global Python namespace no longer has the enum From 80aa445f343ca33a21060eab70235600427d25e9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 May 2025 17:27:36 +0200 Subject: [PATCH 2/3] Format --- spacy/lang/ht/__init__.py | 3 + spacy/lang/ht/lex_attrs.py | 3 + spacy/lang/ht/punctuation.py | 61 +++--- spacy/lang/ht/stop_words.py | 3 +- spacy/lang/ht/tag_map.py | 20 +- spacy/lang/ht/tokenizer_exceptions.py | 195 +++++++++--------- spacy/tests/lang/ht/test_exceptions.py | 14 +- .../tests/lang/ht/test_prefix_suffix_infix.py | 4 +- spacy/tests/lang/ht/test_text.py | 7 +- 9 files changed, 183 insertions(+), 127 deletions(-) diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py index e5c1c2770..9fc2df40c 100644 --- a/spacy/lang/ht/__init__.py +++ b/spacy/lang/ht/__init__.py @@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults): stop_words = STOP_WORDS tag_map = TAG_MAP + class HaitianCreole(Language): lang = "ht" Defaults = HaitianCreoleDefaults + @HaitianCreole.factory( "lemmatizer", assigns=["token.lemma"], @@ -49,4 +51,5 @@ def make_lemmatizer( nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer ) + __all__ = ["HaitianCreole"] diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py index 8a3ec1ff9..ab1a39a82 100644 --- a/spacy/lang/ht/lex_attrs.py +++ b/spacy/lang/ht/lex_attrs.py @@ -49,6 +49,7 @@ NORM_MAP = { "P": "Pa", } + def like_num(text): text = text.strip().lower() if text.startswith(("+", "-", "±", "~")): @@ -69,9 +70,11 @@ def like_num(text): return True return False + def norm_custom(text): return NORM_MAP.get(text, text.lower()) + LEX_ATTRS = { LIKE_NUM: like_num, NORM: norm_custom, diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py index 61d88d6e1..0077db1c0 100644 --- a/spacy/lang/ht/punctuation.py +++ b/spacy/lang/ht/punctuation.py @@ -16,28 +16,43 @@ ELISION = "'’".replace(" ", "") _prefixes_elision = "m n l y t k w" _prefixes_elision += " " 
+ _prefixes_elision.upper() -TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [ - r"(?:({pe})[{el}])(?=[{a}])".format( - a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) - ) -] +TOKENIZER_PREFIXES = ( + LIST_PUNCT + + LIST_QUOTES + + [ + r"(?:({pe})[{el}])(?=[{a}])".format( + a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision) + ) + ] +) -TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [ - r"(?<=[0-9])%", # numbers like 10% - r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers - r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters - r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions - r"(?<=[{a}0-9])\)", # right parenthesis after letter/number - r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string - r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis -] +TOKENIZER_SUFFIXES = ( + LIST_PUNCT + + LIST_QUOTES + + LIST_ELLIPSES + + [ + r"(?<=[0-9])%", # numbers like 10% + r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers + r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters + r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions + r"(?<=[{a}0-9])\)", # right parenthesis after letter/number + r"(?<=[{a}])\.(?=\s|$)".format( + a=ALPHA + ), # period after letter if space or end of string + r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis + ] +) -TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [ - r"(?<=[0-9])[+\-\*^](?=[0-9-])", - r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( - al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES - ), - r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), - r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), -] +TOKENIZER_INFIXES = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), + ] +) diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py index 6243887a4..50998e0e5 100644 --- a/spacy/lang/ht/stop_words.py +++ b/spacy/lang/ht/stop_words.py @@ -39,8 +39,7 @@ sa san si swa si men mèsi oswa osinon -""" -.split() +""".split() ) # Add common contractions, with and without apostrophe variants diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py index 8c9cdd6d4..261d1aef3 100644 --- a/spacy/lang/ht/tag_map.py +++ b/spacy/lang/ht/tag_map.py @@ -1,4 +1,22 @@ -from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X +from spacy.symbols import ( + NOUN, + VERB, + AUX, + ADJ, + ADV, + PRON, + DET, + ADP, + SCONJ, + CCONJ, + PART, + INTJ, + NUM, + PROPN, + PUNCT, + SYM, + X, +) TAG_MAP = { "NOUN": {"pos": NOUN}, diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py index b44ad7a6f..4d617fd36 100644 --- a/spacy/lang/ht/tokenizer_exceptions.py +++ b/spacy/lang/ht/tokenizer_exceptions.py @@ -1,5 +1,6 @@ from spacy.symbols import ORTH, NORM + def make_variants(base, first_norm, second_orth, second_norm): return { base: [ @@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm): {ORTH: second_orth, NORM: second_norm}, ], base.capitalize(): [ - {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()}, + { + ORTH: 
base.split("'")[0].capitalize() + "'", + NORM: first_norm.capitalize(), + }, {ORTH: second_orth, NORM: second_norm}, - ] + ], } -TOKENIZER_EXCEPTIONS = { - "Dr.": [{ORTH: "Dr."}] -} + +TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]} # Apostrophe forms TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap")) @@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap")) TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap")) # Non-apostrophe contractions (with capitalized variants) -TOKENIZER_EXCEPTIONS.update({ - "map": [ - {ORTH: "m", NORM: "mwen"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Map": [ - {ORTH: "M", NORM: "Mwen"}, - {ORTH: "ap", NORM: "ap"}, - ], - "lem": [ - {ORTH: "le", NORM: "le"}, - {ORTH: "m", NORM: "mwen"}, - ], - "Lem": [ - {ORTH: "Le", NORM: "Le"}, - {ORTH: "m", NORM: "mwen"}, - ], - "lew": [ - {ORTH: "le", NORM: "le"}, - {ORTH: "w", NORM: "ou"}, - ], - "Lew": [ - {ORTH: "Le", NORM: "Le"}, - {ORTH: "w", NORM: "ou"}, - ], - "nap": [ - {ORTH: "n", NORM: "nou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Nap": [ - {ORTH: "N", NORM: "Nou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "lap": [ - {ORTH: "l", NORM: "li"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Lap": [ - {ORTH: "L", NORM: "Li"}, - {ORTH: "ap", NORM: "ap"}, - ], - "yap": [ - {ORTH: "y", NORM: "yo"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Yap": [ - {ORTH: "Y", NORM: "Yo"}, - {ORTH: "ap", NORM: "ap"}, - ], - "mte": [ - {ORTH: "m", NORM: "mwen"}, - {ORTH: "te", NORM: "te"}, - ], - "Mte": [ - {ORTH: "M", NORM: "Mwen"}, - {ORTH: "te", NORM: "te"}, - ], - "mpral": [ - {ORTH: "m", NORM: "mwen"}, - {ORTH: "pral", NORM: "pral"}, - ], - "Mpral": [ - {ORTH: "M", NORM: "Mwen"}, - {ORTH: "pral", NORM: "pral"}, - ], - "wap": [ - {ORTH: "w", NORM: "ou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Wap": [ - {ORTH: "W", NORM: "Ou"}, - {ORTH: "ap", NORM: "ap"}, - ], - "kap": [ - {ORTH: "k", NORM: "ki"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Kap": [ - {ORTH: "K", NORM: "Ki"}, - {ORTH: "ap", NORM: "ap"}, - ], - "tap": [ - {ORTH: "t", NORM: "te"}, - {ORTH: "ap", NORM: "ap"}, - ], - "Tap": [ - {ORTH: "T", NORM: "Te"}, - {ORTH: "ap", NORM: "ap"}, - ], -}) +TOKENIZER_EXCEPTIONS.update( + { + "map": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Map": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lem": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "Lem": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "m", NORM: "mwen"}, + ], + "lew": [ + {ORTH: "le", NORM: "le"}, + {ORTH: "w", NORM: "ou"}, + ], + "Lew": [ + {ORTH: "Le", NORM: "Le"}, + {ORTH: "w", NORM: "ou"}, + ], + "nap": [ + {ORTH: "n", NORM: "nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Nap": [ + {ORTH: "N", NORM: "Nou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "lap": [ + {ORTH: "l", NORM: "li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Lap": [ + {ORTH: "L", NORM: "Li"}, + {ORTH: "ap", NORM: "ap"}, + ], + "yap": [ + {ORTH: "y", NORM: "yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Yap": [ + {ORTH: "Y", NORM: "Yo"}, + {ORTH: "ap", NORM: "ap"}, + ], + "mte": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "Mte": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "te", NORM: "te"}, + ], + "mpral": [ + {ORTH: "m", NORM: "mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "Mpral": [ + {ORTH: "M", NORM: "Mwen"}, + {ORTH: "pral", NORM: "pral"}, + ], + "wap": [ + {ORTH: "w", NORM: "ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Wap": [ + {ORTH: "W", NORM: "Ou"}, + {ORTH: "ap", NORM: "ap"}, + ], + "kap": [ + {ORTH: "k", NORM: "ki"}, + {ORTH: 
"ap", NORM: "ap"}, + ], + "Kap": [ + {ORTH: "K", NORM: "Ki"}, + {ORTH: "ap", NORM: "ap"}, + ], + "tap": [ + {ORTH: "t", NORM: "te"}, + {ORTH: "ap", NORM: "ap"}, + ], + "Tap": [ + {ORTH: "T", NORM: "Te"}, + {ORTH: "ap", NORM: "ap"}, + ], + } +) diff --git a/spacy/tests/lang/ht/test_exceptions.py b/spacy/tests/lang/ht/test_exceptions.py index 685b72c07..ea2e2b204 100644 --- a/spacy/tests/lang/ht/test_exceptions.py +++ b/spacy/tests/lang/ht/test_exceptions.py @@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text): def test_ht_tokenizer_full_sentence(ht_tokenizer): text = "Si'm ka vini, m'ap pale ak li." tokens = [t.text for t in ht_tokenizer(text)] - assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."] + assert tokens == [ + "Si", + "'m", + "ka", + "vini", + ",", + "m'", + "ap", + "pale", + "ak", + "li", + ".", + ] diff --git a/spacy/tests/lang/ht/test_prefix_suffix_infix.py b/spacy/tests/lang/ht/test_prefix_suffix_infix.py index 7dabec17a..5ff409cd9 100644 --- a/spacy/tests/lang/ht/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ht/test_prefix_suffix_infix.py @@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text): assert len(tokens) == 5 -@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]) +@pytest.mark.parametrize( + "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)] +) def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length): tokens = ht_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/ht/test_text.py b/spacy/tests/lang/ht/test_text.py index f396e352a..e63299fc0 100644 --- a/spacy/tests/lang/ht/test_text.py +++ b/spacy/tests/lang/ht/test_text.py @@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre assert len(tokens) == 84 - @pytest.mark.parametrize( "text,length", [ @@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word): @pytest.mark.parametrize( - "word, expected", [ + "word, expected", + [ ("'m", "mwen"), ("'n", "nou"), ("'l", "li"), ("'y", "yo"), ("'w", "ou"), - ] + ], ) def test_ht_lex_attrs_norm_custom(word, expected): assert norm_custom(word) == expected - From c015dd1fa6f28a324340dacf4a409e92af8a3af8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 May 2025 17:27:59 +0200 Subject: [PATCH 3/3] isort --- spacy/lang/ht/__init__.py | 4 ++-- spacy/lang/ht/lemmatizer.py | 2 +- spacy/lang/ht/punctuation.py | 4 ++-- spacy/lang/ht/tag_map.py | 16 ++++++++-------- spacy/lang/ht/tokenizer_exceptions.py | 2 +- spacy/tests/lang/ht/test_noun_chunks.py | 1 + 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py index 9fc2df40c..7f9feb057 100644 --- a/spacy/lang/ht/__init__.py +++ b/spacy/lang/ht/__init__.py @@ -5,11 +5,11 @@ from thinc.api import Model from ...language import BaseDefaults, Language from .lemmatizer import HaitianCreoleLemmatizer from .lex_attrs import LEX_ATTRS -from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS class HaitianCreoleDefaults(BaseDefaults): diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py index 
9ac096f6d..52bf23d23 100644 --- a/spacy/lang/ht/lemmatizer.py +++ b/spacy/lang/ht/lemmatizer.py @@ -1,8 +1,8 @@ from typing import List, Tuple +from ...lookups import Lookups from ...pipeline import Lemmatizer from ...tokens import Token -from ...lookups import Lookups class HaitianCreoleLemmatizer(Lemmatizer): diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py index 0077db1c0..c4a5d090e 100644 --- a/spacy/lang/ht/punctuation.py +++ b/spacy/lang/ht/punctuation.py @@ -4,10 +4,10 @@ from ..char_classes import ( ALPHA_UPPER, CONCAT_QUOTES, HYPHENS, - LIST_PUNCT, - LIST_QUOTES, LIST_ELLIPSES, LIST_ICONS, + LIST_PUNCT, + LIST_QUOTES, merge_chars, ) diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py index 261d1aef3..a190984a6 100644 --- a/spacy/lang/ht/tag_map.py +++ b/spacy/lang/ht/tag_map.py @@ -1,20 +1,20 @@ from spacy.symbols import ( - NOUN, - VERB, - AUX, ADJ, - ADV, - PRON, - DET, ADP, - SCONJ, + ADV, + AUX, CCONJ, - PART, + DET, INTJ, + NOUN, NUM, + PART, + PRON, PROPN, PUNCT, + SCONJ, SYM, + VERB, X, ) diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py index 4d617fd36..deb152c25 100644 --- a/spacy/lang/ht/tokenizer_exceptions.py +++ b/spacy/lang/ht/tokenizer_exceptions.py @@ -1,4 +1,4 @@ -from spacy.symbols import ORTH, NORM +from spacy.symbols import NORM, ORTH def make_variants(base, first_norm, second_orth, second_norm): diff --git a/spacy/tests/lang/ht/test_noun_chunks.py b/spacy/tests/lang/ht/test_noun_chunks.py index 76c5a1df3..fcefd7dfd 100644 --- a/spacy/tests/lang/ht/test_noun_chunks.py +++ b/spacy/tests/lang/ht/test_noun_chunks.py @@ -1,4 +1,5 @@ import pytest + from spacy.tokens import Doc
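
For reference, a minimal sketch of the 'flag' quirk that PATCH 1/3 works around. It assumes a spaCy build compiled with Cython 3 *before* the patch, where the values exported in spacy.parts_of_speech.IDS are enum 'flag' members rather than plain ints; "NOUN" is just one example member, and the exact repr text is illustrative rather than quoted from a real session.

    # Illustrative sketch only -- not part of the patches above.
    from spacy import parts_of_speech

    noun = parts_of_speech.IDS["NOUN"]

    # Duck-typing still works: comparison, arithmetic and printing behave
    # as if `noun` were a plain int, so most code never notices the change.
    assert noun == int(noun)
    assert noun + 1 == int(noun) + 1
    print(noun)

    # These are the two places the 'flag' type leaks through: before the
    # patch the isinstance check fails and the repr is not the bare number.
    print(isinstance(noun, int))   # False before the patch, True after
    print(repr(noun))              # enum-style repr before, plain integer repr after

The patch restores the pre-Cython-3 behaviour with a single cast over the exported mappings, IDS = {name: int(value) for name, value in IDS.items()}, in both spacy/attrs.pyx and spacy/parts_of_speech.pyx.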