diff --git a/requirements.txt b/requirements.txt
index 1786ee186..4f0579313 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=1.0.1,<1.1.0
 catalogue>=0.0.7,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
diff --git a/setup.cfg b/setup.cfg
index 2c8268517..55396e011 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,7 +47,7 @@ install_requires =
     thinc==7.4.0.dev0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
+    srsly>=1.0.1,<1.1.0
     catalogue>=0.0.7,<1.1.0
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 02eb1b200..a85c0b228 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES
@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index d74deb22b..5469e345e 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index 17f6f0ccc..aab063982 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 3af7d9fd1..685619c88 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
 ### Disabling the parser {#disabling}
 
 In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelin). If you don't need
+the [standard processing pipeline](/usage/processing-pipelines). If you don't need
 any of the syntactic information, you should disable the parser. Disabling the
 parser will make spaCy load and run much faster. If you want to load the parser,
 but need to disable it for specific documents, you can also control its use on
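
For reviewers, a minimal sketch of how the Finnish tokenizer changes above can be exercised once the patch is applied. It assumes spaCy is installed from this branch and uses `spacy.blank("fi")` to build a bare pipeline from `spacy/lang/fi` (no trained model needed); the sample sentences mirror the new `ABBREVIATION_INFLECTION_TESTS` cases, and the illustrative phrase with "n:o" is not taken from the tests.

```python
# Sketch only: assumes this patch is applied and spaCy is installed from the branch.
import spacy

nlp = spacy.blank("fi")  # bare Finnish pipeline, tokenizer rules from spacy/lang/fi

# Inflected abbreviations such as "VTT:ssa" and "v:ta" should now stay single
# tokens, since ":" was removed from the Finnish infix patterns.
doc = nlp("VTT:ssa ennen v:ta 2010 suoritetut mittaukset")
print([t.text for t in doc])
# expected, per the new tests:
# ['VTT:ssa', 'ennen', 'v:ta', '2010', 'suoritetut', 'mittaukset']

# The new tokenizer exceptions keep "n:o" / "N:o" / "nro" intact and assign
# them the lemma "numero". Hypothetical phrase for illustration:
doc = nlp("Liite n:o 3")
print([t.text for t in doc])  # should give ['Liite', 'n:o', '3']
```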
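
The docs excerpt in the last hunk ends mid-sentence; for context, a hedged illustration of the parser disabling it describes, assuming an installed `en_core_web_sm` model (`spacy.load(..., disable=[...])` is the documented way to drop a pipeline component):

```python
import spacy

# Load a model without the parser (assumes en_core_web_sm is installed).
# Skipping the parser makes loading and processing noticeably faster.
nlp = spacy.load("en_core_web_sm", disable=["parser"])

doc = nlp("This text is tokenized and tagged, but not parsed.")
print(doc[0].dep_)  # empty string: no dependency labels were assigned
```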