Improve Ligurian tokenization

This commit is contained in:
parent 3e30b5bef6
commit 8a469f06a4
spacy/lang/lij/__init__.py
@@ -1,5 +1,5 @@
 from ...language import BaseDefaults, Language
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -7,6 +7,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class LigurianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
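For context, a minimal sketch of how these defaults reach the tokenizer (hypothetical session, assuming a spaCy build that includes this commit; spacy.blank and spacy.util.compile_prefix_regex are existing spaCy APIs):

import spacy
from spacy.util import compile_prefix_regex

# spacy.blank("lij") builds a Language from LigurianDefaults, so the new
# `prefixes` attribute is compiled into the tokenizer's prefix_search.
nlp = spacy.blank("lij")
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
print(prefix_re.search("'90"))  # matches via the new year-elision prefix rule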
spacy/lang/lij/examples.py
@@ -9,6 +9,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
-    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Vegnîme apreuvo, che ve fasso pescâ di òmmi.",
     "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
 ]
spacy/lang/lij/punctuation.py
@@ -1,11 +1,23 @@
+from ..punctuation import (
+    TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES,
+    TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES,
+)
 from ..char_classes import ALPHA
-from ..punctuation import TOKENIZER_INFIXES
 
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
-
-
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+ELISION = "'’"
+
+
+_prefixes = [
+    r"['’‘][0-9]{2}",  # shorthand for years
+    r"[0-9]+°(?![cfkCFK])",  # use of degree symbol as ordinal indicator
+    r"[{el}‘]nn?[{el}]?".format(el=ELISION),  # elided forms of "un(na)"
+] + BASE_TOKENIZER_PREFIXES
+
+
+_infixes = BASE_TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
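The practical effect of these rules, as a hedged sketch (hypothetical session; assumes a spaCy build with this commit, and mirrors the new tests further down):

import spacy

nlp = spacy.blank("lij")

# The widened infix rule splits after an elision before a letter or digit:
print([t.text for t in nlp("l'ægua inta conchetta")])
# -> ["l'", "ægua", "inta", "conchetta"]

# The new prefix rule keeps elided years intact instead of splitting the
# apostrophe off as plain punctuation:
print([t.text for t in nlp("'90")])  # -> ["'90"]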
spacy/lang/lij/stop_words.py
@@ -1,38 +1,40 @@
 STOP_WORDS = set(
     """
-a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuo apreuvo ascì atra atre atri atro avanti avei aveiva
 
-bella belle belli bello ben
+bell' bell’ bella belle belli bello ben
 
-ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+ch' ch’ che chì chi ciù co-a co-e co-i co-o comm' comm’ comme con contr' contr’ contra cösa coscì cöse
 
-d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+d' d’ da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo drent' drent’ dento
 
-é e ê ea ean emmo en ëse
+é à e ê ea ean emmo en ëse
 
 fin fiña
 
-gh' ghe guæei
+gh' gh’ ghe guæi
 
-i î in insemme int' inta inte inti into
+i î in insemme int' int’ inta inte inti into
 
-l' lê lì lô
+l' l’ lê lì liatre liatri lô loiatre loiatri
 
-m' ma manco me megio meno mezo mi
+m' m’ ma mai manco me megio meno meza meze mezi mezo mi
 
-na n' ne ni ninte nisciun nisciuña no
+n' n’ na ne nì niatre niatri ninte nisciun nisciuña no noiatre noiatri
 
 o ò ô oua
 
 parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
 
-quæ quand' quande quarche quella quelle quelli quello
+quæ quand' quand’ quande quarche quarcösa quell' quell’ quella quelle quelli quello
 
-s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+s' s’ sce scê scì scî scià sciâ sciô sciù se segge seu sò solo son sott' sott’ sotta sta stæta stæte stæti stæto ste sti sto
 
-tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+tant' tant’ tanta tante tanti tanto te teu tò ti torna tra tròppo tutt' tutt’ tutta tutte tutti tutto
 
-un uña unn' unna
+un uña unn' unn’ unna
 
 voî voscià
 
 za zu
 """.split()
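A quick spot-check of the expanded list (hypothetical session, assuming a build with this commit):

import spacy

nlp = spacy.blank("lij")
# Newly added entries are exposed through the lexeme's is_stop attribute:
for word in ["liatri", "quarcösa", "pescâ"]:
    print(word, nlp.vocab[word].is_stop)
# liatri True, quarcösa True, pescâ False ("pescâ" is not a stop word)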
spacy/lang/lij/tokenizer_exceptions.py
@@ -1,49 +1,67 @@
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 
+# Returns capitalized variants, all caps variants and with curly apostrophe
+def _variants(orth, exc):
+    yield orth, exc
+    yield orth.capitalize(), [
+        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
+        for i, e in enumerate(exc)
+    ]
+    yield orth.upper(), [
+        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
+    ]
+    if "'" in orth:
+        yield from _variants(
+            orth.replace("'", "’"),
+            [
+                {ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])}
+                for e in exc
+            ],
+        )
+
+
 _exc = {}
 
-for raw in [
-    "a-e",
-    "a-o",
-    "a-i",
-    "a-a",
-    "co-a",
-    "co-e",
-    "co-i",
-    "co-o",
-    "da-a",
-    "da-e",
-    "da-i",
-    "da-o",
-    "pe-a",
-    "pe-e",
-    "pe-i",
-    "pe-o",
-]:
-    for orth in [raw, raw.capitalize()]:
-        _exc[orth] = [{ORTH: orth}]
+# Compound prepositions
+# Compounds with "inte" and "de" aren't split as they can be ambiguous
+# Format: (compound form, isolated form, determiners it goes with)
+_preps = [
+    ("a-", "à", "oaie"),
+    ("co-", "con", "oaie"),
+    ("da-", "da", "oaie"),
+    ("pe-", "pe", "oaie"),
+    ("pi-", "pe", "a"),  # colloquialism
+    ("de-", "de", "oaie"),  # incorrect, but occasionally seen
+    ("ne-", "inte", "oaie"),  # incorrect, but occasionally seen
+]
+for prep_, prep, dets in _preps:
+    for det in dets:
+        for orth, exc in _variants(
+            prep_ + det, [{ORTH: prep_, NORM: prep}, {ORTH: det}]
+        ):
+            _exc[orth] = exc
 
+# Prefix + prepositions with à (e.g. "sott'a-o")
 for prep in [
     "a-a",
     "a-e",
     "a-o",
     "a-i",
 ]:
     for prefix in [
         "sott'",
         "sott’",
+        "contr'",
+        "contr’",
         "ch'",
         "ch’",
         "s'",
         "s’",
     ]:
         for prefix_orth in [prefix, prefix.capitalize()]:
             _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
 
+# Units
 for u in "cfkCFK":
     _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
     _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
 
+# Other exceptions
+_other_exc = {
+    "'n'": [{ORTH: "'n'", NORM: "unna"}],
+    "‘n'": [{ORTH: "‘n'", NORM: "unna"}],
+    "'n": [{ORTH: "'n", NORM: "un"}],
+    "‘n": [{ORTH: "‘n", NORM: "un"}],
+    "tou": [{ORTH: "t", NORM: "te"}, {ORTH: "ou", NORM: "ô"}],
+}
+for orth_, exc_ in _other_exc.items():
+    for orth, exc in _variants(orth_, exc_):
+        _exc[orth] = exc
+
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
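To make the variant generation concrete, here is a standalone sketch of what _variants yields; plain strings stand in for spaCy's ORTH/NORM symbols, which serve only as dict keys here:

ORTH, NORM = "ORTH", "NORM"  # stand-ins for spacy.symbols.ORTH / NORM

def _variants(orth, exc):
    # Same logic as above: base form, Capitalized, ALL-CAPS, plus (for forms
    # containing a straight apostrophe) the same three with a curly apostrophe.
    yield orth, exc
    yield orth.capitalize(), [
        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
        for i, e in enumerate(exc)
    ]
    yield orth.upper(), [
        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
    ]
    if "'" in orth:
        yield from _variants(
            orth.replace("'", "’"),
            [{ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])} for e in exc],
        )

for orth, exc in _variants("sott'a-a", [{ORTH: "sott'"}, {ORTH: "a-a"}]):
    print(orth, "->", exc)
# Prints six entries: sott'a-a / Sott'a-a / SOTT'A-A, then the same three
# with the curly apostrophe (sott’a-a / Sott’a-a / SOTT’A-A).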
spacy/tests/conftest.py
@@ -282,6 +282,11 @@ def lg_tokenizer():
     return get_lang_class("lg")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def lij_tokenizer():
+    return get_lang_class("lij")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt")().tokenizer
spacy/tests/lang/lij/__init__.py (new file, empty)

spacy/tests/lang/lij/test_exceptions.py (new file)
@@ -0,0 +1,13 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens,expected_norms",
+    [("a-e", ["a-", "e"], ["à", "e"]), ("co-i", ["co-", "i"], ["con", "i"])],
+)
+def test_prepositions(lij_tokenizer, text, expected_tokens, expected_norms):
+    """Test that compound prepositions are split correctly."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
+    assert [t.norm_ for t in tokens] == expected_norms
spacy/tests/lang/lij/test_prefix_suffix_infix.py (new file)
@@ -0,0 +1,24 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["'90", "’90", "‘90"])
+def test_lij_tokenizer_handles_year_elision(lij_tokenizer, text):
+    """Test that elided years (e.g. '90 for 1990) are not split."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("10°C", ["10", "°C"])])
+def test_lij_tokenizer_handles_degrees(lij_tokenizer, text, expected_tokens):
+    """Test that in degree units the degree symbol isn't split from the unit."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("'n'atra", ["'n'", "atra"])])
+def test_lij_tokenizer_handles_left_elision(lij_tokenizer, text, expected_tokens):
+    """Test that left-eliding expressions are not split from their left apostrophe."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
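To run the new Ligurian test modules from a source checkout (hypothetical invocation, equivalent to `pytest spacy/tests/lang/lij -v` from the repo root):

import pytest

pytest.main(["spacy/tests/lang/lij", "-v"])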