mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00

commit 13b516289b
Merge branch 'master' into spacy.io
@@ -5,7 +5,7 @@ thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=1.0.1,<1.1.0
 catalogue>=0.0.7,<1.1.0
 # Third party dependencies
 numpy>=1.15.0

@@ -47,7 +47,7 @@ install_requires =
     thinc==7.4.0.dev0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
+    srsly>=1.0.1,<1.1.0
     catalogue>=0.0.7,<1.1.0
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0

@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES

@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
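For context: the new DASHES pattern and the `(?:{d})` infix rule split tokens on Unicode dash variants while leaving the plain ASCII hyphen attached, and the colon is dropped from the infix set so inflected abbreviations like "VTT:ssa" stay whole. A minimal sketch of the dash behavior with plain `re`, using a hypothetical stand-in for spaCy's LIST_HYPHENS:

import re

# Stand-in subset of LIST_HYPHENS, for illustration only; the real list
# lives in spacy.lang.char_classes.
LIST_HYPHENS = ["-", "–", "—", "~"]
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")

# Mirrors the added infix rule: split only on non-ASCII dashes between letters.
infix_re = re.compile(r"(?<=[a-zäö])(?:{d})(?=[a-zäö])".format(d=DASHES))

print(bool(infix_re.search("itä–länsi")))  # True: en dash is a split point
print(bool(infix_re.search("itä-länsi")))  # False: plain hyphen stays attached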
				
			
@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},

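For context: each added entry keeps the abbreviation for "numero" as a single token and attaches that lemma at tokenization time. A quick sketch with a blank Finnish pipeline (spaCy v2-era behavior; the sample sentence is invented):

from spacy.lang.fi import Finnish

# Blank Finnish pipeline: tokenizer rules only, no statistical models.
nlp = Finnish()
doc = nlp("Katso asetus nro 356 ja sen N:o 5.")
print([t.text for t in doc])  # "nro" and "N:o" should each survive as one token
print([t.lemma_ for t in doc if t.text in ("nro", "N:o")])  # expected: ["numero", "numero"]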
				
			
@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):

@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list

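For context: the fi_tokenizer fixture these tests rely on is not part of this diff; in spaCy's test suite an equivalent fixture is provided by a shared conftest. A minimal sketch of such a fixture (spaCy v2-era API; layout assumed):

import pytest
from spacy.util import get_lang_class

@pytest.fixture(scope="session")
def fi_tokenizer():
    # Build a bare Finnish tokenizer, mirroring the shared fixture in
    # spaCy's test conftest.
    return get_lang_class("fi").Defaults.create_tokenizer()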
				
			
@@ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy).
 ### Disabling the parser {#disabling}
 
 In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelin). If you don't need
+the [standard processing pipeline](/usage/processing-pipelines). If you don't need
 any of the syntactic information, you should disable the parser. Disabling the
 parser will make spaCy load and run much faster. If you want to load the parser,
 but need to disable it for specific documents, you can also control its use on

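For context: the corrected passage describes disabling the parser; a minimal sketch of that usage (the model name is only an example):

import spacy

# Load a pipeline without the dependency parser to speed up loading and runtime.
nlp = spacy.load("en_core_web_sm", disable=["parser"])
doc = nlp("This text is processed without syntactic parsing.")
print(doc.is_parsed)  # False: no dependency parse was assigned (spaCy v2)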