Matthew Honnibal 2025-05-28 17:27:36 +02:00
parent 79f9d3ea2a
commit 80aa445f34
9 changed files with 183 additions and 127 deletions

View File

@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 __all__ = ["HaitianCreole"]
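For context, a minimal usage sketch (assuming this module is installed as spaCy's registered "ht" language, as in this repo): a blank pipeline picks up these defaults, and the factory above makes the lemmatizer available by name.

import spacy

# Assumes the "ht" language defined above is registered with an installed spaCy.
nlp = spacy.blank("ht")
doc = nlp("Si'm ka vini, m'ap pale ak li.")
print([t.text for t in doc])
# Per the tokenizer test further down:
# ['Si', "'m", 'ka', 'vini', ',', "m'", 'ap', 'pale', 'ak', 'li', '.']

# The "lemmatizer" factory defined above can then be added by name
# (rule-based modes may additionally require lookup tables to be installed).
nlp.add_pipe("lemmatizer")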

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
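As a rough illustration of these two hooks (module path assumed to be spacy/lang/ht/lex_attrs.py per spaCy's layout; the expected norms match the parametrized test at the end of this commit, and the like_num results assume the usual digit handling in the part of the function not shown in this hunk):

from spacy.lang.ht.lex_attrs import like_num, norm_custom

print(norm_custom("'m"))  # "mwen" - elided pronoun mapped via NORM_MAP
print(norm_custom("P"))   # "Pa"   - also from NORM_MAP
print(norm_custom("Sa"))  # "sa"   - falls back to plain lowercasing

# like_num strips/lowercases the text and branches on a leading sign;
# plain digit strings are expected to come out as True.
print(like_num("10"))
print(like_num("kay"))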

View File

@@ -16,23 +16,37 @@ ELISION = "'".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
         r"(?:({pe})[{el}])(?=[{a}])".format(
             a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
         )
     ]
+)
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
         r"(?<=[0-9])%",  # numbers like 10%
         r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
         r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
         r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
         r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-        r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
         r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
     ]
+)
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
             al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
@@ -41,3 +55,4 @@ TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
         r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
         r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
     ]
+)
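A quick way to sanity-check the elision prefix pattern on its own, using plain re outside the tokenizer (ALPHA and merge_chars come from spacy.lang.char_classes; merge_chars joins the space-separated prefixes with "|", and the [{el}] class is written out as ['] here since ELISION is just the apostrophe):

import re

from spacy.lang.char_classes import ALPHA, merge_chars

_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()

pattern = r"(?:({pe})['])(?=[{a}])".format(
    pe=merge_chars(_prefixes_elision), a=ALPHA
)
print(re.match(pattern, "m'ap"))  # matches the leading "m'"
print(re.match(pattern, "map"))   # None - no apostrophe, so no prefix split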

View File

@@ -39,8 +39,7 @@ sa san si swa si
 men mèsi oswa osinon
-"""
-    .split()
+""".split()
 )
 # Add common contractions, with and without apostrophe variants
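For reference, entries from this list end up driving Token.is_stop in a blank "ht" pipeline (a small sketch, assuming the language is installed and registered):

import spacy

nlp = spacy.blank("ht")
doc = nlp("men mèsi")
print([(t.text, t.is_stop) for t in doc])
# Both words appear in STOP_WORDS above, so both should come out as True.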

View File

@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    NOUN,
+    VERB,
+    AUX,
+    ADJ,
+    ADV,
+    PRON,
+    DET,
+    ADP,
+    SCONJ,
+    CCONJ,
+    PART,
+    INTJ,
+    NUM,
+    PROPN,
+    PUNCT,
+    SYM,
+    X,
+)
 TAG_MAP = {
     "NOUN": {"pos": NOUN},

View File

@@ -1,5 +1,6 @@
 from spacy.symbols import ORTH, NORM
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
         base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,7 +32,8 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
+TOKENIZER_EXCEPTIONS.update(
+    {
         "map": [
             {ORTH: "m", NORM: "mwen"},
             {ORTH: "ap", NORM: "ap"},
@@ -118,4 +122,5 @@ TOKENIZER_EXCEPTIONS.update({
             {ORTH: "T", NORM: "Te"},
             {ORTH: "ap", NORM: "ap"},
         ],
-})
+    }
+)
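To make the helper concrete: the two keys generated for "m'ap" split the contraction into "m'" + "ap" and also cover the capitalized spelling (module path assumed per spaCy's layout; the expected output follows from the code above and the tokenizer test below):

from spacy.lang.ht.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from spacy.symbols import NORM, ORTH

for key in ("m'ap", "M'ap"):
    print(key, [(d[ORTH], d[NORM]) for d in TOKENIZER_EXCEPTIONS[key]])
# m'ap [("m'", 'mwen'), ('ap', 'ap')]
# M'ap [("M'", 'Mwen'), ('ap', 'ap')]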

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected