Matthew Honnibal 2025-05-28 17:28:07 +02:00 committed by GitHub
commit b84c131df0
GPG Key ID: B5690EEEBB952194
13 changed files with 196 additions and 134 deletions

View File

@@ -91,6 +91,9 @@ IDS = {
    "MORPH": MORPH,
    "IDX": IDX,
}
# Make these ints in Python, so that we don't get this unexpected 'flag' type
# This will match the behaviour before Cython 3
IDS = {name: int(value) for name, value in IDS.items()}
# ATTR IDs, in order of the symbol
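
The int() coercion above keeps the attribute IDs as plain Python ints no matter what type Cython 3 gives the enum members. A minimal standalone sketch of the same pattern (the _Attr enum and its values are hypothetical stand-ins, not spaCy's real attribute IDs):

import enum

class _Attr(enum.IntFlag):  # stand-in for the Cython-generated enum type
    LEMMA = 73
    NORM = 74

IDS = {"LEMMA": _Attr.LEMMA, "NORM": _Attr.NORM}
IDS = {name: int(value) for name, value in IDS.items()}
assert all(type(value) is int for value in IDS.values())  # plain ints, as before Cython 3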

View File

@@ -5,11 +5,11 @@ from thinc.api import Model
from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class HaitianCreoleDefaults(BaseDefaults):
@@ -22,10 +22,12 @@ class HaitianCreoleDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    tag_map = TAG_MAP


class HaitianCreole(Language):
    lang = "ht"
    Defaults = HaitianCreoleDefaults
@HaitianCreole.factory(
"lemmatizer",
assigns=["token.lemma"],
@@ -49,4 +51,5 @@ def make_lemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )
__all__ = ["HaitianCreole"]
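
With the factory registered under the name "lemmatizer", the component can be added to a blank Haitian Creole pipeline by name. A rough usage sketch (assuming this branch of spaCy is installed; the example is illustrative, not part of the commit):

import spacy

nlp = spacy.blank("ht")                  # Language subclass resolved via lang = "ht"
lemmatizer = nlp.add_pipe("lemmatizer")  # built through the factory registered above
print(nlp.pipe_names)                    # ['lemmatizer']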

View File

@@ -1,8 +1,8 @@
from typing import List, Tuple
from ...lookups import Lookups
from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups
class HaitianCreoleLemmatizer(Lemmatizer):

View File

@@ -49,6 +49,7 @@ NORM_MAP = {
    "P": "Pa",
}
def like_num(text):
    text = text.strip().lower()
    if text.startswith(("+", "-", "±", "~")):
@@ -69,9 +70,11 @@ def like_num(text):
        return True
    return False


def norm_custom(text):
    return NORM_MAP.get(text, text.lower())
LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm_custom,
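
The hunk above is cut off mid-dict; for context, here is a standalone sketch of norm_custom's fallback behaviour (NORM_MAP abbreviated to the single entry shown above; the second token is illustrative and assumed not to be in the real map):

NORM_MAP = {"P": "Pa"}  # abbreviated; the real map holds many more entries

def norm_custom(text):
    return NORM_MAP.get(text, text.lower())

assert norm_custom("P") == "Pa"           # mapped entry wins
assert norm_custom("KREYÒL") == "kreyòl"  # anything else is simply lowercased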

View File

@@ -4,10 +4,10 @@ from ..char_classes import (
    ALPHA_UPPER,
    CONCAT_QUOTES,
    HYPHENS,
    LIST_PUNCT,
    LIST_QUOTES,
    LIST_ELLIPSES,
    LIST_ICONS,
    LIST_PUNCT,
    LIST_QUOTES,
    merge_chars,
)
@@ -16,28 +16,43 @@ ELISION = "'".replace(" ", "")
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
    r"(?:({pe})[{el}])(?=[{a}])".format(
        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
    )
]
TOKENIZER_PREFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + [
        r"(?:({pe})[{el}])(?=[{a}])".format(
            a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
        )
    ]
)
TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
    r"(?<=[0-9])%",  # numbers like 10%
    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
    r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
    r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
]
TOKENIZER_SUFFIXES = (
    LIST_PUNCT
    + LIST_QUOTES
    + LIST_ELLIPSES
    + [
        r"(?<=[0-9])%",  # numbers like 10%
        r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
        r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
        r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
        r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
        r"(?<=[{a}])\.(?=\s|$)".format(
            a=ALPHA
        ),  # period after letter if space or end of string
        r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
    ]
)
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]
TOKENIZER_INFIXES = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
    ]
)
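
A quick way to sanity-check the elision prefix rule defined above is to try the single pattern in isolation. The sketch below substitutes a simplified character class for spaCy's merged ALPHA and for merge_chars(_prefixes_elision), so it is an approximation of the rule, not the tokenizer itself:

import re

ELISION = "'"
prefixes = "m|n|l|y|t|k|w|M|N|L|Y|T|K|W"  # simplified stand-in for merge_chars(_prefixes_elision)
pattern = r"(?:({pe})[{el}])(?=[{a}])".format(a="a-zA-Z", el=ELISION, pe=prefixes)

assert re.match(pattern, "m'ap")     # "m'" is split off as a prefix
assert re.match(pattern, "T'ap")     # capitalized variants are covered too
assert not re.match(pattern, "map")  # no apostrophe, no prefix match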

View File

@@ -39,8 +39,7 @@ sa san si swa si
men mèsi oswa osinon
"""
.split()
""".split()
)
# Add common contractions, with and without apostrophe variants

View File

@@ -1,4 +1,22 @@
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
from spacy.symbols import (
    ADJ,
    ADP,
    ADV,
    AUX,
    CCONJ,
    DET,
    INTJ,
    NOUN,
    NUM,
    PART,
    PRON,
    PROPN,
    PUNCT,
    SCONJ,
    SYM,
    VERB,
    X,
)

TAG_MAP = {
    "NOUN": {"pos": NOUN},

View File

@@ -1,4 +1,5 @@
from spacy.symbols import ORTH, NORM
from spacy.symbols import NORM, ORTH
def make_variants(base, first_norm, second_orth, second_norm):
    return {
@@ -7,14 +8,16 @@ def make_variants(base, first_norm, second_orth, second_norm):
            {ORTH: second_orth, NORM: second_norm},
        ],
        base.capitalize(): [
            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
            {
                ORTH: base.split("'")[0].capitalize() + "'",
                NORM: first_norm.capitalize(),
            },
            {ORTH: second_orth, NORM: second_norm},
        ]
        ],
    }
TOKENIZER_EXCEPTIONS = {
"Dr.": [{ORTH: "Dr."}]
}
TOKENIZER_EXCEPTIONS = {"Dr.": [{ORTH: "Dr."}]}
# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
@ -29,93 +32,95 @@ TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update({
    "map": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Map": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lem": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "Lem": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "m", NORM: "mwen"},
    ],
    "lew": [
        {ORTH: "le", NORM: "le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "Lew": [
        {ORTH: "Le", NORM: "Le"},
        {ORTH: "w", NORM: "ou"},
    ],
    "nap": [
        {ORTH: "n", NORM: "nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Nap": [
        {ORTH: "N", NORM: "Nou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "lap": [
        {ORTH: "l", NORM: "li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Lap": [
        {ORTH: "L", NORM: "Li"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "yap": [
        {ORTH: "y", NORM: "yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Yap": [
        {ORTH: "Y", NORM: "Yo"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "mte": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "Mte": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "te", NORM: "te"},
    ],
    "mpral": [
        {ORTH: "m", NORM: "mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "Mpral": [
        {ORTH: "M", NORM: "Mwen"},
        {ORTH: "pral", NORM: "pral"},
    ],
    "wap": [
        {ORTH: "w", NORM: "ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Wap": [
        {ORTH: "W", NORM: "Ou"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "kap": [
        {ORTH: "k", NORM: "ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Kap": [
        {ORTH: "K", NORM: "Ki"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "tap": [
        {ORTH: "t", NORM: "te"},
        {ORTH: "ap", NORM: "ap"},
    ],
    "Tap": [
        {ORTH: "T", NORM: "Te"},
        {ORTH: "ap", NORM: "ap"},
    ],
})
TOKENIZER_EXCEPTIONS.update(
    {
        "map": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Map": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lem": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "Lem": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "m", NORM: "mwen"},
        ],
        "lew": [
            {ORTH: "le", NORM: "le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "Lew": [
            {ORTH: "Le", NORM: "Le"},
            {ORTH: "w", NORM: "ou"},
        ],
        "nap": [
            {ORTH: "n", NORM: "nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Nap": [
            {ORTH: "N", NORM: "Nou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "lap": [
            {ORTH: "l", NORM: "li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Lap": [
            {ORTH: "L", NORM: "Li"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "yap": [
            {ORTH: "y", NORM: "yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Yap": [
            {ORTH: "Y", NORM: "Yo"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "mte": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "Mte": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "te", NORM: "te"},
        ],
        "mpral": [
            {ORTH: "m", NORM: "mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "Mpral": [
            {ORTH: "M", NORM: "Mwen"},
            {ORTH: "pral", NORM: "pral"},
        ],
        "wap": [
            {ORTH: "w", NORM: "ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Wap": [
            {ORTH: "W", NORM: "Ou"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "kap": [
            {ORTH: "k", NORM: "ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Kap": [
            {ORTH: "K", NORM: "Ki"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "tap": [
            {ORTH: "t", NORM: "te"},
            {ORTH: "ap", NORM: "ap"},
        ],
        "Tap": [
            {ORTH: "T", NORM: "Te"},
            {ORTH: "ap", NORM: "ap"},
        ],
    }
)

View File

@@ -23,7 +23,9 @@ IDS = {
    "SPACE": SPACE
}
# Make these ints in Python, so that we don't get this unexpected 'flag' type
# This will match the behaviour before Cython 3
IDS = {name: int(value) for name, value in IDS.items()}
NAMES = {value: key for key, value in IDS.items()}
# As of Cython 3.1, the global Python namespace no longer has the enum
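
The NAMES mapping simply inverts IDS, so the two stay consistent once the values are coerced to plain ints. A tiny round-trip check (the numeric value is a hypothetical stand-in for the real enum member):

IDS = {"SPACE": int(103)}  # stand-in id; in the real module this comes from the Cython enum
NAMES = {value: key for key, value in IDS.items()}
assert NAMES[IDS["SPACE"]] == "SPACE"  # name -> id -> name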

View File

@@ -29,4 +29,16 @@ def test_ht_tokenizer_handles_basic_abbreviation(ht_tokenizer, text):
def test_ht_tokenizer_full_sentence(ht_tokenizer):
    text = "Si'm ka vini, m'ap pale ak li."
    tokens = [t.text for t in ht_tokenizer(text)]
    assert tokens == ["Si", "'m", "ka", "vini", ",", "m'", "ap", "pale", "ak", "li", "."]
    assert tokens == [
        "Si",
        "'m",
        "ka",
        "vini",
        ",",
        "m'",
        "ap",
        "pale",
        "ak",
        "li",
        ".",
    ]

View File

@@ -1,4 +1,5 @@
import pytest
from spacy.tokens import Doc

View File

@@ -37,7 +37,9 @@ def test_ht_tokenizer_splits_uneven_wrap(ht_tokenizer, text):
    assert len(tokens) == 5


@pytest.mark.parametrize("text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)])
@pytest.mark.parametrize(
    "text,length", [("Ozetazini.", 2), ("Frans.", 2), ("(Ozetazini.", 3)]
)
def test_ht_tokenizer_splits_prefix_interact(ht_tokenizer, text, length):
    tokens = ht_tokenizer(text)
    assert len(tokens) == length

View File

@@ -16,7 +16,6 @@ Nan Washington, Depatman Deta Etazini pibliye yon deklarasyon ki eksprime "regre
    assert len(tokens) == 84


@pytest.mark.parametrize(
    "text,length",
    [
@@ -66,14 +65,14 @@ def test_ht_lex_attrs_capitals(word):
@pytest.mark.parametrize(
    "word, expected", [
    "word, expected",
    [
        ("'m", "mwen"),
        ("'n", "nou"),
        ("'l", "li"),
        ("'y", "yo"),
        ("'w", "ou"),
    ]
    ],
)
def test_ht_lex_attrs_norm_custom(word, expected):
    assert norm_custom(word) == expected