From e1f777b15171fef086c20a5d2415f9f2154e1544 Mon Sep 17 00:00:00 2001
From: Antti Ajanki
Date: Tue, 11 Feb 2020 03:32:43 +0200
Subject: [PATCH] Improvements for Finnish tokenizer (#4985)

* don't split on a colon. Colon is used to attach suffixes for abbreviations
* tokenize on any of LIST_HYPHENS (except a single hyphen), not just on --
* simplify infix rules by merging similar rules
---
 spacy/lang/fi/punctuation.py          |  9 ++++-----
 spacy/lang/fi/tokenizer_exceptions.py |  3 +++
 spacy/tests/lang/fi/test_tokenizer.py | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 02eb1b200..a85c0b228 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES
@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index d74deb22b..5469e345e 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index 17f6f0ccc..aab063982 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
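
Not part of the patch: a minimal sketch of how the new behaviour can be checked interactively, assuming a spaCy 2.x checkout with this change applied. `Finnish` is spaCy's blank Finnish language class; the first two sentences and their expected token lists are taken from the tests added above, while the en-dash sentence is an extra illustrative assumption, not from the tests.

    # Illustrative sketch only (not part of the patch). Assumes spaCy 2.x
    # with this change applied.
    from spacy.lang.fi import Finnish

    nlp = Finnish()

    def tokens(text):
        # Mirror the tests: keep token texts, drop whitespace-only tokens.
        return [t.text for t in nlp(text) if not t.is_space]

    # Colon-attached suffixes on abbreviations and numerals are no longer split off:
    print(tokens("VTT:ssa ennen v:ta 2010 suoritetut mittaukset"))
    # ['VTT:ssa', 'ennen', 'v:ta', '2010', 'suoritetut', 'mittaukset']
    print(tokens("Hiihtäjä oli kilpailun 14:s."))
    # ['Hiihtäjä', 'oli', 'kilpailun', '14:s', '.']

    # Every dash in LIST_HYPHENS except the plain hyphen is now an infix,
    # so an en dash between words should split (assumed example):
    print(tokens("Helsinki–Tampere"))
    # expected: ['Helsinki', '–', 'Tampere']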