commit b84c131df0
Matthew Honnibal, 2025-05-28 17:28:07 +02:00 (committed by GitHub)
13 changed files with 196 additions and 134 deletions

View File

@@ -91,6 +91,9 @@ IDS = {
     "MORPH": MORPH,
     "IDX": IDX,
 }
+# Make these ints in Python, so that we don't get this unexpected 'flag' type
+# This will match the behaviour before Cython 3
+IDS = {name: int(value) for name, value in IDS.items()}
 # ATTR IDs, in order of the symbol
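Note (not part of the diff): the comprehension added here is the substance of the change. Under Cython 3 the values coming out of the cdef enum are enum/"flag" objects rather than plain Python ints, so coercing them restores the pre-Cython-3 behaviour. A minimal sketch of the invariant this guarantees:

from spacy.attrs import IDS

# After the comprehension above, every attribute ID is a plain Python int,
# not a Cython 3 enum/"flag" object, so comparisons and serialisation behave
# as they did before Cython 3.
assert all(type(value) is int for value in IDS.values())
assert type(IDS["IDX"]) is int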

View File

@@ -5,11 +5,11 @@ from thinc.api import Model
 from ...language import BaseDefaults, Language
 from .lemmatizer import HaitianCreoleLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class HaitianCreoleDefaults(BaseDefaults):
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
 class HaitianCreole(Language):
     lang = "ht"
     Defaults = HaitianCreoleDefaults
 @HaitianCreole.factory(
     "lemmatizer",
     assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
         nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
     )
 __all__ = ["HaitianCreole"]

View File

@@ -1,8 +1,8 @@
 from typing import List, Tuple
+from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...tokens import Token
-from ...lookups import Lookups
 class HaitianCreoleLemmatizer(Lemmatizer):

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
     "P": "Pa",
 }
 def like_num(text):
     text = text.strip().lower()
     if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
         return True
     return False
 def norm_custom(text):
     return NORM_MAP.get(text, text.lower())
 LEX_ATTRS = {
     LIKE_NUM: like_num,
     NORM: norm_custom,
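These lexical attribute helpers are pure functions, so their behaviour is easy to pin down. A short sketch (mirroring the tests later in this commit; the import path spacy.lang.ht.lex_attrs is assumed):

from spacy.lang.ht.lex_attrs import like_num, norm_custom

# like_num accepts plain digit strings (leading +/-/±/~ are stripped first).
assert like_num("25")
# norm_custom prefers NORM_MAP entries and falls back to lowercasing.
assert norm_custom("'m") == "mwen"
assert norm_custom("Bonjou") == "bonjou"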

View File

@@ -4,10 +4,10 @@ from ..char_classes import (
     ALPHA_UPPER,
     CONCAT_QUOTES,
     HYPHENS,
-    LIST_PUNCT,
-    LIST_QUOTES,
     LIST_ELLIPSES,
     LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
     merge_chars,
 )
@@ -16,23 +16,37 @@ ELISION = "'".replace(" ", "")
 _prefixes_elision = "m n l y t k w"
 _prefixes_elision += " " + _prefixes_elision.upper()
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
+TOKENIZER_PREFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + [
         r"(?:({pe})[{el}])(?=[{a}])".format(
             a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
         )
     ]
+)
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ELLIPSES
+    + [
         r"(?<=[0-9])%",  # numbers like 10%
         r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
         r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
         r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
         r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-        r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
+        r"(?<=[{a}])\.(?=\s|$)".format(
+            a=ALPHA
+        ),  # period after letter if space or end of string
         r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
     ]
+)
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
             al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
@@ -40,4 +54,5 @@ TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
         r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
     ]
+)
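The three lists above only change shape here (black-style wrapping), not content; they are consumed by spaCy's tokenizer machinery. A small sketch of how the prefix rules get compiled and what the elision rule does (not part of the commit):

from spacy.lang.ht.punctuation import TOKENIZER_PREFIXES
from spacy.util import compile_prefix_regex

# All prefix rules are joined into a single anchored regex that the tokenizer
# applies repeatedly to the front of each token candidate.
prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
# The elision rule splits the short pronoun plus apostrophe off the next word,
# e.g. the "m'" in "m'ap".
assert prefix_re.search("m'ap") is not None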

View File

@@ -39,8 +39,7 @@ sa san si swa si
 men mèsi oswa osinon
-"""
-.split()
+""".split()
 )
 # Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
+from spacy.symbols import (
+    ADJ,
+    ADP,
+    ADV,
+    AUX,
+    CCONJ,
+    DET,
+    INTJ,
+    NOUN,
+    NUM,
+    PART,
+    PRON,
+    PROPN,
+    PUNCT,
+    SCONJ,
+    SYM,
+    VERB,
+    X,
+)
 TAG_MAP = {
     "NOUN": {"pos": NOUN},

View File

@@ -1,4 +1,5 @@
-from spacy.symbols import ORTH, NORM
+from spacy.symbols import NORM, ORTH
 def make_variants(base, first_norm, second_orth, second_norm):
     return {
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
             {ORTH: second_orth, NORM: second_norm},
         ],
         base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
+            {
+                ORTH: base.split("'")[0].capitalize() + "'",
+                NORM: first_norm.capitalize(),
+            },
             {ORTH: second_orth, NORM: second_norm},
-        ]
+        ],
     }
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
+TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
 # Apostrophe forms
 TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@@ -29,7 +32,8 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
 TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
 # Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
+TOKENIZER_EXCEPTIONS.update(
+    {
         "map": [
             {ORTH: "m", NORM: "mwen"},
             {ORTH: "ap", NORM: "ap"},
@@ -118,4 +122,5 @@ TOKENIZER_EXCEPTIONS.update({
             {ORTH: "T", NORM: "Te"},
             {ORTH: "ap", NORM: "ap"},
         ],
-})
+    }
+)
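Taken together with make_variants above, each apostrophe contraction expands into a two-token exception. A rough illustration of the resulting entries (consistent with the full-sentence tokenizer test later in this commit; the import path is assumed):

from spacy.lang.ht.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from spacy.symbols import NORM, ORTH

# "m'ap" is split into "m'" + "ap", normalised to "mwen" + "ap";
# the capitalized variant "M'ap" gets a matching entry of its own.
entry = TOKENIZER_EXCEPTIONS["m'ap"]
assert [piece[ORTH] for piece in entry] == ["m'", "ap"]
assert [piece[NORM] for piece in entry] == ["mwen", "ap"]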

View File

@@ -23,7 +23,9 @@ IDS = {
     "SPACE": SPACE
 }
+# Make these ints in Python, so that we don't get this unexpected 'flag' type
+# This will match the behaviour before Cython 3
+IDS = {name: int(value) for name, value in IDS.items()}
 NAMES = {value: key for key, value in IDS.items()}
 # As of Cython 3.1, the global Python namespace no longer has the enum
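Same pattern as the attrs change above: because NAMES is built from IDS right after the cast, the ID-to-name round trip keeps working on plain ints. A quick check (not part of the commit):

from spacy.parts_of_speech import IDS, NAMES

# Plain ints, and the reverse mapping built from them still round-trips.
assert all(type(value) is int for value in IDS.values())
assert all(NAMES[value] == name for name, value in IDS.items())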

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
 def test_ht_tokenizer_full_sentence(ht_tokenizer):
     text = "Si'm ka vini, m'ap pale ak li."
     tokens = [t.text for t in ht_tokenizer(text)]
-    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
+    assert tokens == [
+        "Si",
+        "'m",
+        "ka",
+        "vini",
+        ",",
+        "m'",
+        "ap",
+        "pale",
+        "ak",
+        "li",
+        ".",
+    ]

View File

@@ -1,4 +1,5 @@
 import pytest
 from spacy.tokens import Doc

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
     assert len(tokens) == 5
-@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
+@pytest.mark.parametrize(
+    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
+)
 def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
     tokens = ht_tokenizer(text)
     assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
     assert len(tokens) == 84
 @pytest.mark.parametrize(
     "text,length",
     [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
 @pytest.mark.parametrize(
-    "word, expected", [
+    "word, expected",
+    [
         ("'m", "mwen"),
         ("'n", "nou"),
         ("'l", "li"),
         ("'y", "yo"),
         ("'w", "ou"),
-    ]
+    ],
 )
 def test_ht_lex_attrs_norm_custom(word, expected):
     assert norm_custom(word) == expected