Improve Ligurian tokenization

2025-12-23 01:53:17 +03:00 · 2024-11-24 18:18:37 -08:00 · 2024-11-24 18:18:37 -08:00 · 8a469f06a4
commit 8a469f06a4
parent 3e30b5bef6
9 changed files with 136 additions and 61 deletions
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,5 +1,5 @@
 from ...language import BaseDefaults, Language
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -7,6 +7,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class LigurianDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
    stop_words = STOP_WORDS
--- a/spacy/lang/lij/examples.py
+++ b/spacy/lang/lij/examples.py
@@ -9,6 +9,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
    "Sciusciâ e sciorbî no se peu.",
    "Graçie di çetroin, che me son arrivæ.",
-    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Vegnîme apreuvo, che ve fasso pescâ di òmmi.",
    "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
 ]
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@@ -1,11 +1,23 @@
 from ..punctuation import (
    TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES,
    TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES,
 )
 from ..char_classes import ALPHA
 from ..punctuation import TOKENIZER_INFIXES
 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
-_infixes = TOKENIZER_INFIXES + [
+ELISION = "'’"
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+
 _prefixes = [
    r"['’‘][0-9]{2}",  # shorthand for years
    r"[0-9]+°(?![cfkCFK])",  # use of degree symbol as ordinal indicator
    r"[{el}‘]nn?[{el}]?".format(el=ELISION),  # elided forms of "un(na)"
 ] + BASE_TOKENIZER_PREFIXES
 _infixes = BASE_TOKENIZER_INFIXES + [
    r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]
 TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/lij/stop_words.py
+++ b/spacy/lang/lij/stop_words.py
@@ -1,38 +1,40 @@
 STOP_WORDS = set(
    """
-a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuo apreuvo ascì atra atre atri atro avanti avei aveiva
-bella belle belli bello ben
+bell' bell’ bella belle belli bello ben
-ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+ch' ch’ che chì chi ciù co-a co-e co-i co-o comm' comm’ comme con contr' contr’ contra cösa coscì cöse
-d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+d' d’ da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo drent' drent’ dento
-é e ê ea ean emmo en ëse
+é à e ê ea ean emmo en ëse
 fin fiña
-gh' ghe guæei
+gh' gh’ ghe guæi
-i î in insemme int' inta inte inti into
+i î in insemme int' int’ inta inte inti into
-l' lê lì lô
+l' l’ lê lì liatre liatri lô loiatre loiatri
-m' ma manco me megio meno mezo mi
+m' m’ ma mai manco me megio meno meza meze mezi mezo mi
-na n' ne ni ninte nisciun nisciuña no
+n' n’ na ne nì niatre niatri ninte nisciun nisciuña no noiatre noiatri
 o ò ô oua
 parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
-quæ quand' quande quarche quella quelle quelli quello
+quæ quand' quand’ quande quarche quarcösa quell' quell’ quella quelle quelli quello
-s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+s' s’ sce scê scì scî scià sciâ sciô sciù se segge seu sò solo son sott' sott’ sotta sta stæta stæte stæti stæto ste sti sto
-tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+tant' tant’ tanta tante tanti tanto te teu tò ti torna tra tròppo tutt' tutt’ tutta tutte tutti tutto
-un uña unn' unna
+un uña unn' unn’ unna
 voî voscià
 za zu
 """.split()
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@@ -1,49 +1,67 @@
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 # Expands one tokenizer exception into its capitalized, all-caps and
 # curly-apostrophe variants.
 def _variants(orth, exc):
    """Yield (orth, exc) pairs for an exception and its spelling variants.
    orth is the exception string; exc is its list of token dicts, each
    with an ORTH entry and optionally a NORM entry. Every derived variant
    sets NORM explicitly, falling back to the source token's ORTH when the
    token has no NORM of its own.
    """
    # The exception exactly as it was passed in.
    yield orth, exc
    # Capitalized form: only the first token's ORTH is capitalized, the
    # remaining tokens keep their ORTH unchanged.
    yield orth.capitalize(), [
        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
        for i, e in enumerate(exc)
    ]
    # All-caps form: every token's ORTH is upper-cased.
    yield orth.upper(), [
        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
    ]
    # Straight apostrophes are replaced by the curly ’ and the result is
    # expanded recursively, so the curly form also gets its own capitalized
    # and all-caps variants. NORM still falls back to the straight-apostrophe
    # ORTH of the source token.
    if "'" in orth:
        yield from _variants(
            orth.replace("'", "’"),
            [
                {ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])}
                for e in exc
            ],
        )
 _exc = {}
-for raw in [
+# Compound prepositions
    "a-e",
    "a-o",
    "a-i",
    "a-a",
    "co-a",
    "co-e",
    "co-i",
    "co-o",
    "da-a",
    "da-e",
    "da-i",
    "da-o",
    "pe-a",
    "pe-e",
    "pe-i",
    "pe-o",
 ]:
    for orth in [raw, raw.capitalize()]:
        _exc[orth] = [{ORTH: orth}]
-# Prefix + prepositions with à (e.g. "sott'a-o")
+# Compounds with "inte" and "de" aren't split as they can be ambiguous
 # Format: (compound form, isolated form, determiners it goes with)
 _preps = [
    ("a-", "à", "oaie"),
    ("co-", "con", "oaie"),
    ("da-", "da", "oaie"),
    ("pe-", "pe", "oaie"),
    ("pi-", "pe", "a"),  # colloquialism
    ("de-", "de", "oaie"),  # incorrect, but occasionally seen
    ("ne-", "inte", "oaie"),  # incorrect, but occasionally seen
 ]
 for prep_, prep, dets in _preps:
    for det in dets:
        for orth, exc in _variants(
            prep_ + det, [{ORTH: prep_, NORM: prep}, {ORTH: det}]
        ):
            _exc[orth] = exc
-for prep in [
+# Units
-    "a-a",
+
-    "a-e",
+for u in "cfkCFK":
-    "a-o",
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
-    "a-i",
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
-]:
+
-    for prefix in [
+# Other exceptions
-        "sott'",
+
-        "sott’",
+_other_exc = {
-        "contr'",
+    "'n'": [{ORTH: "'n'", NORM: "unna"}],
-        "contr’",
+    "‘n'": [{ORTH: "‘n'", NORM: "unna"}],
-        "ch'",
+    "'n": [{ORTH: "'n", NORM: "un"}],
-        "ch’",
+    "‘n": [{ORTH: "‘n", NORM: "un"}],
-        "s'",
+    "tou": [{ORTH: "t", NORM: "te"}, {ORTH: "ou", NORM: "ô"}],
-        "s’",
+}
-    ]:
+for orth_, exc_ in _other_exc.items():
-        for prefix_orth in [prefix, prefix.capitalize()]:
+    for orth, exc in _variants(orth_, exc_):
-            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
+        _exc[orth] = exc
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -282,6 +282,11 @@ def lg_tokenizer():
    return get_lang_class("lg")().tokenizer
# Calls whatever get_lang_class("lij") returns and exposes the resulting
# object's tokenizer; session scope means pytest reuses the same value
# across the tests of a run.
@pytest.fixture(scope="session")
 def lij_tokenizer():
    return get_lang_class("lij")().tokenizer
@pytest.fixture(scope="session")
 def lt_tokenizer():
    return get_lang_class("lt")().tokenizer
--- a/spacy/tests/lang/lij/__init__.py
+++ b/spacy/tests/lang/lij/__init__.py
--- a/spacy/tests/lang/lij/test_exceptions.py
+++ b/spacy/tests/lang/lij/test_exceptions.py
@@ -0,0 +1,13 @@
 import pytest
# Each case is (input text, expected token texts, expected token norms).
@pytest.mark.parametrize(
    "text,expected_tokens,expected_norms",
    [("a-e", ["a-", "e"], ["à", "e"]), ("co-i", ["co-", "i"], ["con", "i"])],
 )
 def test_prepositions(lij_tokenizer, text, expected_tokens, expected_norms):
    """Test that compound prepositions are split correctly."""
    tokens = lij_tokenizer(text)
    # Both listed cases are expected to split into exactly two tokens.
    assert len(tokens) == 2
    assert [t.text for t in tokens] == expected_tokens
    # The norm of each token is checked in addition to its surface text.
    assert [t.norm_ for t in tokens] == expected_norms
--- a/spacy/tests/lang/lij/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/lij/test_prefix_suffix_infix.py
@@ -0,0 +1,24 @@
 import pytest
# Cases cover the straight apostrophe (') and both curly single quotation
# marks (’ and ‘) in front of a two-digit year.
@pytest.mark.parametrize("text", ["'90", "’90", "‘90"])
 def test_lij_tokenizer_handles_year_elision(lij_tokenizer, text):
    """Test that elided years (e.g. '90 for 1990) are not split."""
    tokens = lij_tokenizer(text)
    # A single token means the apostrophe stayed attached to the digits.
    assert len(tokens) == 1
# Each case is (input text, expected token texts).
@pytest.mark.parametrize("text,expected_tokens", [("10°C", ["10", "°C"])])
 def test_lij_tokenizer_handles_degrees(lij_tokenizer, text, expected_tokens):
    """Test that in degree units the degree symbol isn't split from the unit."""
    tokens = lij_tokenizer(text)
    # Whitespace tokens are dropped before comparing against the expectation.
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
# Each case is (input text, expected token texts).
@pytest.mark.parametrize("text,expected_tokens", [("'n'atra", ["'n'", "atra"])])
 def test_lij_tokenizer_handles_left_elision(lij_tokenizer, text, expected_tokens):
    """Test that left-eliding expressions are not split from their left apostrophe."""
    tokens = lij_tokenizer(text)
    # Whitespace tokens are dropped before comparing against the expectation.
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list