Matthew Honnibal 2025-05-28 17:27:36 +02:00
parent 79f9d3ea2a
commit 80aa445f34
9 changed files with 183 additions and 127 deletions

View File

@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    tag_map = TAG_MAP


class HaitianCreole(Language):
    lang = "ht"
    Defaults = HaitianCreoleDefaults


@HaitianCreole.factory(
    "lemmatizer",
    assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )


__all__ = ["HaitianCreole"]
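Not shown in the diff, but useful orientation: a minimal, hypothetical sketch of attaching the "lemmatizer" factory registered above to a blank pipeline. It assumes the "ht" code resolves to the HaitianCreole class in this file and that the factory's default configuration needs no extra setup.

# Hypothetical usage sketch for the "lemmatizer" factory registered above.
import spacy

nlp = spacy.blank("ht")                  # instantiates HaitianCreole
lemmatizer = nlp.add_pipe("lemmatizer")  # factory defaults for mode/overwrite
print(nlp.pipe_names)                    # ["lemmatizer"]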

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
    "P": "Pa",
}


def like_num(text):
    text = text.strip().lower()
    if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
        return True
    return False


def norm_custom(text):
    return NORM_MAP.get(text, text.lower())


LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm_custom,
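For orientation, a small sketch (not part of the commit) of what these getters return; the import path is assumed from the file's contents, and the "'m" case mirrors the parametrized test later in this commit.

# Sketch of the lexical attribute getters defined above (import path assumed).
from spacy.lang.ht.lex_attrs import like_num, norm_custom

assert norm_custom("P") == "Pa"      # explicit entry in NORM_MAP
assert norm_custom("'m") == "mwen"   # matches the parametrized test in this commit
print(like_num("-25"))               # expected True; the startswith check above handles the sign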

View File

@@ -16,28 +16,43 @@ ELISION = "'".replace(" ", "")
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()

TOKENIZER_PREFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + [
        r"(?:({pe})[{el}])(?=[{a}])".format(
            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
        )
    ]
)

TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + LIST_ELLIPSES
    + [
        r"(?<=[0-9])%",  # numbers like 10%
        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
        r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
        r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
        r"(?<=[{a}])\.(?=\s|$)".format(
            a=ALPHA
        ),  # period after letter if space or end of string
        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
    ]
)

TOKENIZER_INFIXES = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
    ]
)
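These lists are raw regex fragments; spaCy compiles them into the tokenizer's prefix, suffix, and infix search functions. A minimal sketch of that wiring, assuming this file lives at spacy/lang/ht/punctuation.py (the path is not shown in the diff):

# Sketch: compiling the rules above the way spaCy's tokenizer does.
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.lang.ht.punctuation import (  # assumed import path
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
    TOKENIZER_INFIXES,
)

prefix_search = compile_prefix_regex(TOKENIZER_PREFIXES).search
suffix_search = compile_suffix_regex(TOKENIZER_SUFFIXES).search
infix_finditer = compile_infix_regex(TOKENIZER_INFIXES).finditer

# The contraction rule should peel "'m" off a token like "Si'm".
match = suffix_search("Si'm")
print(match.group() if match else None)  # expected: "'m"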

View File

@@ -39,8 +39,7 @@ sa san si swa si
men mèsi oswa osinon
""".split()
)

# Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
from spacy.symbols import (
    NOUN,
    VERB,
    AUX,
    ADJ,
    ADV,
    PRON,
    DET,
    ADP,
    SCONJ,
    CCONJ,
    PART,
    INTJ,
    NUM,
    PROPN,
    PUNCT,
    SYM,
    X,
)

TAG_MAP = {
    "NOUN": {"pos": NOUN},

View File

@@ -1,5 +1,6 @@
from spacy.symbols import ORTH, NORM


def make_variants(base, first_norm, second_orth, second_norm):
    return {
        base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
            {ORTH: second_orth, NORM: second_norm},
        ],
        base.capitalize(): [
            {
                ORTH: base.split("'")[0].capitalize() + "'",
                NORM: first_norm.capitalize(),
            },
            {ORTH: second_orth, NORM: second_norm},
        ],
    }


TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}

# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))

# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update(
    {
        "map": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Map": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lem": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "Lem": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "lew": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "Lew": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "nap": [
            {ORTH: "n", NORM: "nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Nap": [
            {ORTH: "N", NORM: "Nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lap": [
            {ORTH: "l", NORM: "li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Lap": [
            {ORTH: "L", NORM: "Li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "yap": [
            {ORTH: "y", NORM: "yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Yap": [
            {ORTH: "Y", NORM: "Yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "mte": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "Mte": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "mpral": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "Mpral": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "wap": [
            {ORTH: "w", NORM: "ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Wap": [
            {ORTH: "W", NORM: "Ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "kap": [
            {ORTH: "k", NORM: "ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Kap": [
            {ORTH: "K", NORM: "Ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "tap": [
            {ORTH: "t", NORM: "te"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Tap": [
            {ORTH: "T", NORM: "Te"},
            {ORTH: "ap", NORM: "ap"},
        ],
    }
)
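A short usage sketch, not part of the commit, that exercises the exception table above through a blank pipeline; it assumes the "ht" code is registered with spaCy, as the tests in this commit imply.

# Hypothetical check of the contraction handling defined above.
import spacy

nlp = spacy.blank("ht")
doc = nlp("Si'm ka vini, m'ap pale ak li.")
print([t.text for t in doc])
# The tokenizer test in this commit expects:
# ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
print([t.norm_ for t in doc if t.text == "m'"])  # make_variants maps "m'" to "mwen"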

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
def test_ht_tokenizer_full_sentence(ht_tokenizer):
    text = "Si'm ka vini, m'ap pale ak li."
    tokens = [t.text for t in ht_tokenizer(text)]
    assert tokens == [
        "Si",
        "'m",
        "ka",
        "vini",
        ",",
        "m'",
        "ap",
        "pale",
        "ak",
        "li",
        ".",
    ]

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
    assert len(tokens) == 5


@pytest.mark.parametrize(
    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
)
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
    assert len(tokens) == 84


@pytest.mark.parametrize(
    "text,length",
    [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
@pytest.mark.parametrize(
    "word, expected",
    [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
    ],
)
def test_ht_lex_attrs_norm_custom(word, expected):
    assert norm_custom(word) == expected