diff --git a/spacy/lang/ht/__init__.py b/spacy/lang/ht/__init__.py
index e5c1c2770..9fc2df40c 100644
--- a/spacy/lang/ht/__init__.py
+++ b/spacy/lang/ht/__init__.py
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 
+
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 
+
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 
+
 __all__ = ["HaitianCreole"]
diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py
index 8a3ec1ff9..ab1a39a82 100644
--- a/spacy/lang/ht/lex_attrs.py
+++ b/spacy/lang/ht/lex_attrs.py
@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 
+
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 
+
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 
+
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
diff --git a/spacy/lang/ht/punctuation.py b/spacy/lang/ht/punctuation.py
index 61d88d6e1..0077db1c0 100644
--- a/spacy/lang/ht/punctuation.py
+++ b/spacy/lang/ht/punctuation.py
@@ -16,28 +16,43 @@ ELISION = "'’".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
 
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
+        r"(?:({pe})[{el}])(?=[{a}])".format(
+            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
+        )
+    ]
+)
 
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
-    r"(?<=[0-9])%",  # numbers like 10%
-    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
-    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
-    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
-    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
-    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
-]
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
+        r"(?<=[0-9])%",  # numbers like 10%
+        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
+        r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
+        r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
+        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
+        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
+    ]
+)
 
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
-    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
-    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
-        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
-    ),
-    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
-]
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+    ]
+)
diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py
index 6243887a4..50998e0e5 100644
--- a/spacy/lang/ht/stop_words.py
+++ b/spacy/lang/ht/stop_words.py
@@ -39,8 +39,7 @@ sa san si swa si
 
 men mèsi oswa osinon
 
-"""
-.split()
+""".split()
 )
 
 # Add common contractions, with and without apostrophe variants
diff --git a/spacy/lang/ht/tag_map.py b/spacy/lang/ht/tag_map.py
index 8c9cdd6d4..261d1aef3 100644
--- a/spacy/lang/ht/tag_map.py
+++ b/spacy/lang/ht/tag_map.py
@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    NOUN,
+    VERB,
+    AUX,
+    ADJ,
+    ADV,
+    PRON,
+    DET,
+    ADP,
+    SCONJ,
+    CCONJ,
+    PART,
+    INTJ,
+    NUM,
+    PROPN,
+    PUNCT,
+    SYM,
+    X,
+)
 
 TAG_MAP = {
     "NOUN": {"pos": NOUN},
diff --git a/spacy/lang/ht/tokenizer_exceptions.py b/spacy/lang/ht/tokenizer_exceptions.py
index b44ad7a6f..4d617fd36 100644
--- a/spacy/lang/ht/tokenizer_exceptions.py
+++ b/spacy/lang/ht/tokenizer_exceptions.py
@@ -1,5 +1,6 @@
 from spacy.symbols import ORTH, NORM
 
+
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
         base: [
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
 
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
-    "map": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Map": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lem": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "Lem": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "lew": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "Lew": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "nap": [
-        {ORTH: "n", NORM: "nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Nap": [
-        {ORTH: "N", NORM: "Nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lap": [
-        {ORTH: "l", NORM: "li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Lap": [
-        {ORTH: "L", NORM: "Li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "yap": [
-        {ORTH: "y", NORM: "yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Yap": [
-        {ORTH: "Y", NORM: "Yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "mte": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "Mte": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "mpral": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "Mpral": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "wap": [
-        {ORTH: "w", NORM: "ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Wap": [
-        {ORTH: "W", NORM: "Ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "kap": [
-        {ORTH: "k", NORM: "ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Kap": [
-        {ORTH: "K", NORM: "Ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "tap": [
-        {ORTH: "t", NORM: "te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Tap": [
-        {ORTH: "T", NORM: "Te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-})
+TOKENIZER_EXCEPTIONS.update(
+    {
+        "map": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Map": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lem": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "Lem": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "m", NORM: "mwen"},
+        ],
+        "lew": [
+            {ORTH: "le", NORM: "le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "Lew": [
+            {ORTH: "Le", NORM: "Le"},
+            {ORTH: "w", NORM: "ou"},
+        ],
+        "nap": [
+            {ORTH: "n", NORM: "nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Nap": [
+            {ORTH: "N", NORM: "Nou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "lap": [
+            {ORTH: "l", NORM: "li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Lap": [
+            {ORTH: "L", NORM: "Li"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "yap": [
+            {ORTH: "y", NORM: "yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Yap": [
+            {ORTH: "Y", NORM: "Yo"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "mte": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "Mte": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "te", NORM: "te"},
+        ],
+        "mpral": [
+            {ORTH: "m", NORM: "mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "Mpral": [
+            {ORTH: "M", NORM: "Mwen"},
+            {ORTH: "pral", NORM: "pral"},
+        ],
+        "wap": [
+            {ORTH: "w", NORM: "ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Wap": [
+            {ORTH: "W", NORM: "Ou"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "kap": [
+            {ORTH: "k", NORM: "ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Kap": [
+            {ORTH: "K", NORM: "Ki"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "tap": [
+            {ORTH: "t", NORM: "te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+        "Tap": [
+            {ORTH: "T", NORM: "Te"},
+            {ORTH: "ap", NORM: "ap"},
+        ],
+    }
+)
diff --git a/spacy/tests/lang/ht/test_exceptions.py b/spacy/tests/lang/ht/test_exceptions.py
index 685b72c07..ea2e2b204 100644
--- a/spacy/tests/lang/ht/test_exceptions.py
+++ b/spacy/tests/lang/ht/test_exceptions.py
@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]
diff --git a/spacy/tests/lang/ht/test_prefix_suffix_infix.py b/spacy/tests/lang/ht/test_prefix_suffix_infix.py
index 7dabec17a..5ff409cd9 100644
--- a/spacy/tests/lang/ht/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ht/test_prefix_suffix_infix.py
@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
 
 
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length
diff --git a/spacy/tests/lang/ht/test_text.py b/spacy/tests/lang/ht/test_text.py
index f396e352a..e63299fc0 100644
--- a/spacy/tests/lang/ht/test_text.py
+++ b/spacy/tests/lang/ht/test_text.py
@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 
 
-
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 
 
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected
-
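
For reviewers who want to sanity-check the reformatted files by hand, here is a minimal sketch (not part of the patch) that loads the blank Haitian Creole pipeline and reproduces the tokenization asserted in test_ht_tokenizer_full_sentence. It assumes this branch is installed in the current environment, e.g. via an editable install of the spaCy checkout.

# Sketch only: assumes this branch of spaCy is installed (e.g. pip install -e .).
import spacy

# Blank "ht" pipeline: tokenizer plus the defaults registered in spacy/lang/ht.
nlp = spacy.blank("ht")
doc = nlp("Si'm ka vini, m'ap pale ak li.")

# Same expectation as test_ht_tokenizer_full_sentence in the diff above.
assert [t.text for t in doc] == [
    "Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", ".",
]

# The tokenizer exceptions also set NORM, so contracted forms should expose
# their expanded norms (e.g. the "m'" in "M'ap" normalizing to a form of "mwen")
# via token.norm_.
print([(t.text, t.norm_) for t in nlp("M'ap vini")])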