Improve Ligurian tokenization

Jean Maillard 2024-11-24 18:18:37 -08:00
parent 3e30b5bef6
commit 8a469f06a4
9 changed files with 136 additions and 61 deletions

View File

@@ -1,5 +1,5 @@
 from ...language import BaseDefaults, Language
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -7,6 +7,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class LigurianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
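
Registering prefixes on LigurianDefaults is what feeds the new rules into the tokenizer that Language builds from these defaults. A minimal sketch of checking that wiring, assuming a spaCy build that includes this change:

import spacy
from spacy.util import compile_prefix_regex

# The blank "lij" pipeline builds its tokenizer from LigurianDefaults, so the
# prefix patterns registered above should now be part of its prefix search.
nlp = spacy.blank("lij")
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
print(prefix_re.search("'90"))  # expected to match if the year rule is active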

View File

@@ -9,6 +9,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
-    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Vegnîme apreuvo, che ve fasso pescâ di òmmi.",
     "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
 ]

View File

@@ -1,11 +1,23 @@
+from ..punctuation import (
+    TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES,
+    TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES,
+)
 from ..char_classes import ALPHA
-from ..punctuation import TOKENIZER_INFIXES
 
-ELISION = " ' ".strip().replace(" ", "").replace("\n", "")
+ELISION = "'"
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+_prefixes = [
+    r"['][0-9]{2}",  # shorthand for years
+    r"[0-9]+°(?![cfkCFK])",  # use of degree symbol as ordinal indicator
+    r"[{el}]nn?[{el}]?".format(el=ELISION),  # elided forms of "un(na)"
+] + BASE_TOKENIZER_PREFIXES
+
+_infixes = BASE_TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
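
The three added prefix rules are plain regular expressions, so they can be exercised in isolation. A standalone sketch, assuming the patterns exactly as shown above (spaCy anchors each prefix pattern at the start of the candidate token):

import re

ELISION = "'"
prefix_patterns = [
    r"['][0-9]{2}",                          # elided years, e.g. "'90"
    r"[0-9]+°(?![cfkCFK])",                  # degree sign as ordinal, e.g. "5°" but not "5°C"
    r"[{el}]nn?[{el}]?".format(el=ELISION),  # elided "un(na)": 'n, 'nn, 'n'
]

for text in ["'90", "5°", "5°C", "'n'atra"]:
    matched = [p for p in prefix_patterns if re.match(p, text)]
    print(text, "->", matched)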

View File

@@ -1,38 +1,40 @@
 STOP_WORDS = set(
     """
-a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuo apreuvo ascì atra atre atri atro avanti avei aveiva
-bella belle belli bello ben
+bell' bell bella belle belli bello ben
-ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+ch' ch che chì chi ciù co-a co-e co-i co-o comm' comm comme con contr' contr contra cösa coscì cöse
-d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+d' d da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo drent' drent dento
-é e ê ea ean emmo en ëse
+é à e ê ea ean emmo en ëse
 fin fiña
-gh' ghe guæei
+gh' gh ghe guæi
-i î in insemme int' inta inte inti into
+i î in insemme int' int inta inte inti into
-l' lê lì lô
+l' l lê lì liatre liatri loiatre loiatri
-m' ma manco me megio meno mezo mi
+m' m ma mai manco me megio meno meza meze mezi mezo mi
-na n' ne ni ninte nisciun nisciuña no
+n' n na ne nì niatre niatri ninte nisciun nisciuña no noiatre noiatri
 o ò ô oua
 parte pe pe-a pe-i pe-e pe-o perché pittin primma pròpio
-quæ quand' quande quarche quella quelle quelli quello
+quæ quand' quand quande quarche quarcösa quell' quell quella quelle quelli quello
-s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+s' s sce scê scì scî scià sciâ sciô sciù se segge seu sò solo son sott' sott sotta sta stæta stæte stæti stæto ste sti sto
-tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+tant' tant tanta tante tanti tanto te teu tò ti torna tra tròppo tutt' tutt tutta tutte tutti tutto
-un uña unn' unna
+un uña unn' unn unna
+voî voscià
 za zu
 """.split()

View File

@@ -1,49 +1,67 @@
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
+
+# Returns capitalized variants, all caps variants and with curly apostrophe
+def _variants(orth, exc):
+    yield orth, exc
+    yield orth.capitalize(), [
+        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
+        for i, e in enumerate(exc)
+    ]
+    yield orth.upper(), [
+        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
+    ]
+    if "'" in orth:
+        yield from _variants(
+            orth.replace("'", "’"),
+            [
+                {ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])}
+                for e in exc
+            ],
+        )
+
+
 _exc = {}
 
-for raw in [
-    "a-e",
-    "a-o",
-    "a-i",
-    "a-a",
-    "co-a",
-    "co-e",
-    "co-i",
-    "co-o",
-    "da-a",
-    "da-e",
-    "da-i",
-    "da-o",
-    "pe-a",
-    "pe-e",
-    "pe-i",
-    "pe-o",
-]:
-    for orth in [raw, raw.capitalize()]:
-        _exc[orth] = [{ORTH: orth}]
-
-# Prefix + prepositions with à (e.g. "sott'a-o")
-
-for prep in [
-    "a-a",
-    "a-e",
-    "a-o",
-    "a-i",
-]:
-    for prefix in [
-        "sott'",
-        "sott",
-        "contr'",
-        "contr",
-        "ch'",
-        "ch",
-        "s'",
-        "s",
-    ]:
-        for prefix_orth in [prefix, prefix.capitalize()]:
-            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
+# Compound prepositions
+# Compounds with "inte" and "de" aren't split as they can be ambiguous
+# Format: (compound form, isolated form, determiners it goes with)
+_preps = [
+    ("a-", "à", "oaie"),
+    ("co-", "con", "oaie"),
+    ("da-", "da", "oaie"),
+    ("pe-", "pe", "oaie"),
+    ("pi-", "pe", "a"),  # colloquialism
+    ("de-", "de", "oaie"),  # incorrect, but occasionally seen
+    ("ne-", "inte", "oaie"),  # incorrect, but occasionally seen
+]
+for prep_, prep, dets in _preps:
+    for det in dets:
+        for orth, exc in _variants(
+            prep_ + det, [{ORTH: prep_, NORM: prep}, {ORTH: det}]
+        ):
+            _exc[orth] = exc
+
+# Units
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+# Other exceptions
+_other_exc = {
+    "'n'": [{ORTH: "'n'", NORM: "unna"}],
+    "n'": [{ORTH: "n'", NORM: "unna"}],
+    "'n": [{ORTH: "'n", NORM: "un"}],
+    "n": [{ORTH: "n", NORM: "un"}],
+    "tou": [{ORTH: "t", NORM: "te"}, {ORTH: "ou", NORM: "ô"}],
+}
+for orth_, exc_ in _other_exc.items():
+    for orth, exc in _variants(orth_, exc_):
+        _exc[orth] = exc
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
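
For each base spelling, the exceptions table ends up with the capitalized, all-caps and curly-apostrophe forms as well. A hypothetical standalone helper that mirrors just the orth side of the variant generation above makes the expansion easy to see:

# Hypothetical sketch: which surface forms one exception orth expands to,
# assuming the curly-apostrophe variant generation shown above.
def variant_orths(orth):
    yield orth
    yield orth.capitalize()
    yield orth.upper()
    if "'" in orth:
        yield from variant_orths(orth.replace("'", "’"))


print(list(variant_orths("a-o")))   # ['a-o', 'A-o', 'A-O']
print(list(variant_orths("'n'")))   # ["'n'", "'n'", "'N'", '’n’', '’n’', '’N’']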

View File

@@ -282,6 +282,11 @@ def lg_tokenizer():
     return get_lang_class("lg")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def lij_tokenizer():
+    return get_lang_class("lij")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt")().tokenizer

View File

View File

@@ -0,0 +1,13 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens,expected_norms",
+    [("a-e", ["a-", "e"], ["à", "e"]), ("co-i", ["co-", "i"], ["con", "i"])],
+)
+def test_prepositions(lij_tokenizer, text, expected_tokens, expected_norms):
+    """Test that compound prepositions are split correctly."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
+    assert [t.norm_ for t in tokens] == expected_norms

View File

@@ -0,0 +1,24 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["'90", "’90", "‘90"])
+def test_lij_tokenizer_handles_year_elision(lij_tokenizer, text):
+    """Test that elided years (e.g. '90 for 1990) are not split."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("10°C", ["10", "°C"])])
+def test_lij_tokenizer_handles_degrees(lij_tokenizer, text, expected_tokens):
+    """Test that in degree units the degree symbol isn't split from the unit."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("'n'atra", ["'n'", "atra"])])
+def test_lij_tokenizer_handles_left_elision(lij_tokenizer, text, expected_tokens):
+    """Test that left-eliding expressions are not split from their left apostrophe."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
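
For a quick manual check of the behaviours pinned down by these tests, the blank Ligurian pipeline can be run directly; a sketch assuming a spaCy build that includes this commit:

import spacy

# Tokenize the strings used in the tests above and print text/norm pairs;
# the tests pin the expected splits, this just makes them easy to eyeball.
nlp = spacy.blank("lij")

for text in ["a-e", "co-i", "'90", "10°C", "'n'atra"]:
    doc = nlp(text)
    print(text, "->", [(t.text, t.norm_) for t in doc])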