From e1f777b15171fef086c20a5d2415f9f2154e1544 Mon Sep 17 00:00:00 2001
From: Antti Ajanki
Date: Tue, 11 Feb 2020 03:32:43 +0200
Subject: [PATCH] Improvements for Finnish tokenizer (#4985)

* don't split on a colon. Colon is used to attach suffixes for abbreviations
* tokenize on any of LIST_HYPHENS (except a single hyphen), not just on --
* simplify infix rules by merging similar rules
---
 spacy/lang/fi/punctuation.py          |  9 ++++-----
 spacy/lang/fi/tokenizer_exceptions.py |  3 +++
 spacy/tests/lang/fi/test_tokenizer.py | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 02eb1b200..a85c0b228 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES
@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index d74deb22b..5469e345e 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index 17f6f0ccc..aab063982 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
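
Not part of the patch: a minimal sketch of how the new behaviour can be checked interactively, assuming a spaCy 2.x checkout with this change applied. `Finnish` is spaCy's blank Finnish language class; the first two sentences and their expected token lists are taken from the tests added above, while the en-dash sentence is an extra illustrative assumption, not from the tests.

    # Illustrative sketch only (not part of the patch). Assumes spaCy 2.x
    # with this change applied.
    from spacy.lang.fi import Finnish

    nlp = Finnish()

    def tokens(text):
        # Mirror the tests: keep token texts, drop whitespace-only tokens.
        return [t.text for t in nlp(text) if not t.is_space]

    # Colon-attached suffixes on abbreviations and numerals are no longer split off:
    print(tokens("VTT:ssa ennen v:ta 2010 suoritetut mittaukset"))
    # ['VTT:ssa', 'ennen', 'v:ta', '2010', 'suoritetut', 'mittaukset']
    print(tokens("Hiihtäjä oli kilpailun 14:s."))
    # ['Hiihtäjä', 'oli', 'kilpailun', '14:s', '.']

    # Every dash in LIST_HYPHENS except the plain hyphen is now an infix,
    # so an en dash between words should split (assumed example):
    print(tokens("Helsinki–Tampere"))
    # expected: ['Helsinki', '–', 'Tampere']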