From 5d8cb60e43194519e4cd5e7d5fd94dcb34573857 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 11 Feb 2020 02:30:54 +0100
Subject: [PATCH 1/3] Update lower pin for srsly to 1.0.1 (#4976)

---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1786ee186..4f0579313 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=1.0.1,<1.1.0
 catalogue>=0.0.7,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
diff --git a/setup.cfg b/setup.cfg
index 2c8268517..55396e011 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,7 +47,7 @@ install_requires =
     thinc==7.4.0.dev0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
+    srsly>=1.0.1,<1.1.0
     catalogue>=0.0.7,<1.1.0
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
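Both files now pin the same range. A quick way to confirm that an installed environment satisfies the new lower bound is a check along these lines (a minimal sketch, not part of the patch; it assumes setuptools' pkg_resources and the third-party packaging library are available):

```python
# Minimal sketch (not part of this patch): verify that the installed srsly
# falls inside the updated pin. pkg_resources ships with setuptools, and
# SpecifierSet comes from the third-party "packaging" library.
import pkg_resources
from packaging.specifiers import SpecifierSet

installed = pkg_resources.get_distribution("srsly").version
pin = SpecifierSet(">=1.0.1,<1.1.0")
print(installed, "satisfies" if installed in pin else "violates", "the srsly pin")
```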
From 479e81bafc90f61556bab5583566e134a00f6aaa Mon Sep 17 00:00:00 2001
From: Julin S <48789920+ju-sh@users.noreply.github.com>
Date: Tue, 11 Feb 2020 07:01:26 +0530
Subject: [PATCH 2/3] fix link (#4977)

---
 website/docs/usage/linguistic-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 3af7d9fd1..685619c88 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
 ### Disabling the parser {#disabling}
 
 In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelin). If you don't need
+the [standard processing pipeline](/usage/processing-pipelines). If you don't need
 any of the syntactic information, you should disable the parser. Disabling the
 parser will make spaCy load and run much faster. If you want to load the parser,
 but need to disable it for specific documents, you can also control its use on
From e1f777b15171fef086c20a5d2415f9f2154e1544 Mon Sep 17 00:00:00 2001
From: Antti Ajanki
Date: Tue, 11 Feb 2020 03:32:43 +0200
Subject: [PATCH 3/3] Improvements for Finnish tokenizer (#4985)

* don't split on a colon, since a colon is used to attach suffixes to
  abbreviations
* tokenize on any of LIST_HYPHENS (except a single hyphen), not just on --
* simplify the infix rules by merging similar rules
---
 spacy/lang/fi/punctuation.py          |  9 ++++-----
 spacy/lang/fi/tokenizer_exceptions.py |  3 +++
 spacy/tests/lang/fi/test_tokenizer.py | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 02eb1b200..a85c0b228 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES
@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index d74deb22b..5469e345e 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index 17f6f0ccc..aab063982 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
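Taken together, the rule changes keep colon-attached inflectional endings (VTT:ssa, v:ta, 14:s) on the abbreviation while splitting on the longer dashes in LIST_HYPHENS and leaving the plain ASCII hyphen alone. A minimal sketch of the resulting behavior, assuming a spaCy build that includes this patch (the texts and expected tokens are taken from the new tests):

```python
# Demo of the new Finnish tokenizer behavior, assuming a spaCy build that
# includes this patch. Texts and expected tokens come from the tests above.
from spacy.lang.fi import Finnish

nlp = Finnish()  # blank Finnish pipeline; running it just tokenizes

cases = [
    ("VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
     ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]),
    ("ALV:n osuus on 24 %.",
     ["ALV:n", "osuus", "on", "24", "%", "."]),
    ("Hiihtäjä oli kilpailun 14:s.",
     ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]),
]
for text, expected in cases:
    tokens = [t.text for t in nlp(text) if not t.is_space]
    assert tokens == expected, tokens
```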