Christian Clauss 2025-08-31 15:40:20 +02:00
parent 41e07772dc
commit 972f8a6354
14 changed files with 195 additions and 138 deletions

View File

@@ -54,9 +54,9 @@ jobs:
   tests:
     name: Test
-    needs: Validate
+    needs: validate
     strategy:
-      fail-fast: true
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python_version: ["3.9", "3.12", "3.13"]

View File

@@ -99,7 +99,7 @@ def parse_config_overrides(
     RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
     """
     env_string = os.environ.get(env_var, "") if env_var else ""
-    env_overrides = _parse_overrides(split_arg_string(env_string))
+    env_overrides = _parse_overrides(split_arg_string(env_string))  # type: ignore[operator]
     cli_overrides = _parse_overrides(args, is_cli=True)
     if cli_overrides:
         keys = [k for k in cli_overrides if k not in env_overrides]
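Context for the type-ignore above: the function combines overrides read from an environment variable with overrides passed on the command line, with the command-line values typically taking precedence. A minimal standalone sketch of that merge pattern, using shlex.split in place of split_arg_string and illustrative names (including the env var) rather than spaCy's actual helpers:

import os
import shlex
from typing import Any, Dict, List


def merge_overrides_sketch(cli_args: List[str], env_var: str = "ENV_OVERRIDES") -> Dict[str, Any]:
    # Split the env var shell-style into "key=value" items, then apply CLI items last
    # so explicit command-line settings win over environment defaults.
    items = shlex.split(os.environ.get(env_var, "")) + cli_args
    overrides: Dict[str, Any] = {}
    for item in items:
        key, _, value = item.partition("=")
        overrides[key.lstrip("-")] = value
    return overrides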

View File

@@ -5,11 +5,11 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import HaitianCreoleLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class HaitianCreoleDefaults(BaseDefaults):
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP

+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults

+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )

+
 __all__ = ["HaitianCreole"]
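Since this file wires the Haitian Creole defaults and a lemmatizer factory together, a short hedged usage sketch may help; it assumes the language is registered under the code "ht" as declared above:

import spacy

# spacy.blank("ht") builds a bare Language using HaitianCreoleDefaults, so the
# tokenizer exceptions, prefixes/suffixes/infixes above are already active.
nlp = spacy.blank("ht")
print([t.text for t in nlp("Si'm ka vini, m'ap pale ak li.")])

# The "lemmatizer" factory declared above can then be added as a component, e.g.:
# nlp.add_pipe("lemmatizer", config={"mode": "rule"})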

View File

@@ -1,8 +1,8 @@
 from typing import List, Tuple

+from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...tokens import Token
-from ...lookups import Lookups


 class HaitianCreoleLemmatizer(Lemmatizer):

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }

+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False

+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())

+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
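For reference, norm_custom (shown in full above) checks NORM_MAP first and falls back to lowercasing. A quick illustrative check, assuming the module path spacy.lang.ht.lex_attrs:

from spacy.lang.ht.lex_attrs import norm_custom

# An explicit NORM_MAP entry wins; anything else is lowercased.
assert norm_custom("P") == "Pa"           # "P": "Pa" appears in NORM_MAP above
assert norm_custom("Bonjou") == "bonjou"  # no entry, so the lowercase fallback applies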

View File

@@ -4,10 +4,10 @@ from ..char_classes import (
     ALPHA_UPPER,
     CONCAT_QUOTES,
     HYPHENS,
-    LIST_PUNCT,
-    LIST_QUOTES,
     LIST_ELLIPSES,
     LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
     merge_chars,
 )
@@ -16,23 +16,37 @@ ELISION = "'".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()

-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
         r"(?:({pe})[{el}])(?=[{a}])".format(
             a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
         )
     ]
+)

-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
         r"(?<=[0-9])%",  # numbers like 10%
         r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
         r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
         r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
         r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-        r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
         r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
     ]
+)

-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
             al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
@@ -40,4 +54,5 @@ TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
         r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
     ]
+)
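The parenthesised form above changes only the layout; the concatenated pattern lists are identical. As a hedged sketch of how such lists are typically turned into tokenizer callables (the compile_*_regex helpers are spaCy utilities; the module path spacy.lang.ht.punctuation is assumed):

from spacy.lang.ht.punctuation import (
    TOKENIZER_INFIXES,
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
)
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

# Compile the concatenated pattern lists into the callables a spaCy Tokenizer uses.
prefix_search = compile_prefix_regex(TOKENIZER_PREFIXES).search
suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
infix_finditer = compile_infix_regex(TOKENIZER_INFIXES).finditer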

View File

@ -39,8 +39,7 @@ sa san si swa si
men mèsi oswa osinon men mèsi oswa osinon
""" """.split()
.split()
) )
# Add common contractions, with and without apostrophe variants # Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    ADJ,
+    ADP,
+    ADV,
+    AUX,
+    CCONJ,
+    DET,
+    INTJ,
+    NOUN,
+    NUM,
+    PART,
+    PRON,
+    PROPN,
+    PUNCT,
+    SCONJ,
+    SYM,
+    VERB,
+    X,
+)

 TAG_MAP = {
     "NOUN": {"pos": NOUN},

View File

@@ -1,4 +1,5 @@
-from spacy.symbols import ORTH, NORM
+from spacy.symbols import NORM, ORTH
+

 def make_variants(base, first_norm, second_orth, second_norm):
     return {
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }

-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}

 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,7 +32,8 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))

 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
+TOKENIZER_EXCEPTIONS.update(
+    {
         "map": [
             {ORTH: "m", NORM: "mwen"},
             {ORTH: "ap", NORM: "ap"},
@@ -118,4 +122,5 @@ TOKENIZER_EXCEPTIONS.update({
         {ORTH: "T", NORM: "Te"},
         {ORTH: "ap", NORM: "ap"},
     ],
-})
+    }
+)
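For readers tracing the reformatted make_variants calls, here is a hedged sketch of what one expansion should produce, inferred from the function body shown above (the import path is assumed):

from spacy.lang.ht.tokenizer_exceptions import make_variants
from spacy.symbols import NORM, ORTH

# The apostrophe form and its capitalized variant each split into two tokens,
# with NORM carrying the expanded word.
expected = {
    "m'ap": [{ORTH: "m'", NORM: "mwen"}, {ORTH: "ap", NORM: "ap"}],
    "M'ap": [{ORTH: "M'", NORM: "Mwen"}, {ORTH: "ap", NORM: "ap"}],
}
assert make_variants("m'ap", "mwen", "ap", "ap") == expected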

View File

@@ -187,7 +187,7 @@ class Lemmatizer(Pipe):
         if univ_pos == "":
             warnings.warn(Warnings.W108)
             return [string.lower()]
-        # See Issue #435 for example of where this logic is requied.
+        # See Issue #435 for example of where this logic is required.
         if self.is_base_form(token):
             return [string.lower()]
         index_table = self.lookups.get_table("lemma_index", {})
@@ -210,7 +210,7 @@ class Lemmatizer(Pipe):
         rules = rules_table.get(univ_pos, {})
         orig = string
         string = string.lower()
-        forms = []
+        forms: List[str] = []
         oov_forms = []
         for old, new in rules:
             if string.endswith(old):
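The newly annotated forms list is filled by the suffix-rule loop that begins at the end of this hunk. A standalone sketch of that style of rule application (illustrative only, not the Lemmatizer's exact code):

from typing import List, Set, Tuple


def apply_suffix_rules(string: str, rules: List[Tuple[str, str]], index: Set[str]) -> List[str]:
    # Try each (old_suffix, new_suffix) rule; keep candidates found in the lemma index.
    forms: List[str] = []
    for old, new in rules:
        if string.endswith(old):
            candidate = string[: len(string) - len(old)] + new
            if candidate in index:
                forms.append(candidate)
    return forms or [string.lower()]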

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]

View File

@@ -1,4 +1,5 @@
 import pytest
+
 from spacy.tokens import Doc

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5


-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84


-
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected