[Finnish tokenizer] Handle conjunction contractions (#8105)

2025-10-27 14:11:04 +03:00 · 2021-06-16 11:56:47 +03:00 · 2021-06-16 11:56:47 +03:00 · 5a6125c227
commit 5a6125c227
parent b09be3e1cb
2 changed files with 40 additions and 1 deletion
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -1,5 +1,5 @@
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc


@@ -79,5 +79,18 @@ for exc_data in [
 ]:
    _exc[exc_data[ORTH]] = [exc_data]

+# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
+conj_contraction_bases = [
+    ("ett", "että"), ("jott", "jotta"), ("kosk", "koska"), ("mutt", "mutta"),
+    ("vaikk", "vaikka"), ("ehk", "ehkä"), ("miks", "miksi"), ("siks", "siksi"),
+    ("joll", "jos"), ("ell", "jos")
+]
+conj_contraction_negations = [
+    ("en", "en"), ("et", "et"), ("ei", "ei"), ("emme", "emme"),
+    ("ette", "ette"), ("eivat", "eivät"), ("eivät", "eivät")]
+for (base_lower, base_norm) in conj_contraction_bases:
+    for base in [base_lower, base_lower.title()]:
+        for (suffix, suffix_norm) in conj_contraction_negations:
+            _exc[base + suffix] = [{ORTH: base, NORM: base_norm}, {ORTH: suffix, NORM: suffix_norm}]

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -36,6 +36,23 @@ ABBREVIATION_INFLECTION_TESTS = [
    ("EU:n toimesta tehtiin jotain.", ["EU:n", "toimesta", "tehtiin", "jotain", "."]),
 ]

+CONTRACTION_TESTS = [
+    (
+        "Päätimme ettemme tule.",
+        ["Päätimme", "ett", "emme", "tule", "."],
+        ["päätimme", "että", "emme", "tule", "."]
+    ),
+    (
+        "Miksei puhuttaisi?",
+        ["Miks", "ei", "puhuttaisi", "?"],
+        ["miksi", "ei", "puhuttaisi", "?"]
+    ),
+    (
+        "He tottelivat vaikkeivat halunneet",
+        ["He", "tottelivat", "vaikk", "eivat", "halunneet"],
+        ["he", "tottelivat", "vaikka", "eivät", "halunneet"]
+    ),
+]

 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -56,3 +73,12 @@ def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_toke
    tokens = fi_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens,expected_norms", CONTRACTION_TESTS)
+def test_fi_tokenizer_contractions(fi_tokenizer, text, expected_tokens, expected_norms):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    norm_list = [token.norm_ for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+    assert expected_norms == norm_list