From 5a6125c227e805db4ccde7a6645ea868620c731e Mon Sep 17 00:00:00 2001
From: Antti Ajanki
Date: Wed, 16 Jun 2021 11:56:47 +0300
Subject: [PATCH] [Finnish tokenizer] Handle conjunction contractions (#8105)

---
 spacy/lang/fi/tokenizer_exceptions.py | 15 ++++++++++++++-
 spacy/tests/lang/fi/test_tokenizer.py | 26 ++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index 22d710cb0..f0161f8b3 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -1,5 +1,5 @@
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 
 
@@ -79,5 +79,18 @@ for exc_data in [
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
+# Source: https://kaino.kotus.fi/visk/sisallys.php?p=141
+conj_contraction_bases = [
+    ("ett", "että"), ("jott", "jotta"), ("kosk", "koska"), ("mutt", "mutta"),
+    ("vaikk", "vaikka"), ("ehk", "ehkä"), ("miks", "miksi"), ("siks", "siksi"),
+    ("joll", "jos"), ("ell", "jos")
+]
+conj_contraction_negations = [
+    ("en", "en"), ("et", "et"), ("ei", "ei"), ("emme", "emme"),
+    ("ette", "ette"), ("eivat", "eivät"), ("eivät", "eivät")]
+for (base_lower, base_norm) in conj_contraction_bases:
+    for base in [base_lower, base_lower.title()]:
+        for (suffix, suffix_norm) in conj_contraction_negations:
+            _exc[base + suffix] = [{ORTH: base, NORM: base_norm}, {ORTH: suffix, NORM: suffix_norm}]
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index ae16c7eea..b2f23f7fd 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -36,6 +36,23 @@ ABBREVIATION_INFLECTION_TESTS = [
     ("EU:n toimesta tehtiin jotain.", ["EU:n", "toimesta", "tehtiin", "jotain", "."]),
 ]
 
+CONTRACTION_TESTS = [
+    (
+        "Päätimme ettemme tule.",
+        ["Päätimme", "ett", "emme", "tule", "."],
+        ["päätimme", "että", "emme", "tule", "."]
+    ),
+    (
+        "Miksei puhuttaisi?",
+        ["Miks", "ei", "puhuttaisi", "?"],
+        ["miksi", "ei", "puhuttaisi", "?"]
+    ),
+    (
+        "He tottelivat vaikkeivat halunneet",
+        ["He", "tottelivat", "vaikk", "eivat", "halunneet"],
+        ["he", "tottelivat", "vaikka", "eivät", "halunneet"]
+    ),
+]
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -56,3 +73,12 @@ def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_toke
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens,expected_norms", CONTRACTION_TESTS)
+def test_fi_tokenizer_contractions(fi_tokenizer, text, expected_tokens, expected_norms):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    norm_list = [token.norm_ for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+    assert expected_norms == norm_list