From 5d8cb60e43194519e4cd5e7d5fd94dcb34573857 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 11 Feb 2020 02:30:54 +0100
Subject: [PATCH 1/3] Update lower pin for srsly to 1.0.1 (#4976)

---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1786ee186..4f0579313 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
-srsly>=0.1.0,<1.1.0
+srsly>=1.0.1,<1.1.0
 catalogue>=0.0.7,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
diff --git a/setup.cfg b/setup.cfg
index 2c8268517..55396e011 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,7 +47,7 @@ install_requires =
     thinc==7.4.0.dev0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
-    srsly>=0.1.0,<1.1.0
+    srsly>=1.0.1,<1.1.0
     catalogue>=0.0.7,<1.1.0
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
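Both files now pin the same range. A quick way to confirm that an installed environment satisfies the new lower bound is a check along these lines (a minimal sketch, not part of the patch; it assumes setuptools' pkg_resources and the third-party packaging library are available):

```python
# Minimal sketch (not part of this patch): verify that the installed srsly
# falls inside the updated pin. pkg_resources ships with setuptools, and
# SpecifierSet comes from the third-party "packaging" library.
import pkg_resources
from packaging.specifiers import SpecifierSet

installed = pkg_resources.get_distribution("srsly").version
pin = SpecifierSet(">=1.0.1,<1.1.0")
print(installed, "satisfies" if installed in pin else "violates", "the srsly pin")
```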
From 479e81bafc90f61556bab5583566e134a00f6aaa Mon Sep 17 00:00:00 2001
From: Julin S <48789920+ju-sh@users.noreply.github.com>
Date: Tue, 11 Feb 2020 07:01:26 +0530
Subject: [PATCH 2/3] fix link (#4977)

---
 website/docs/usage/linguistic-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 3af7d9fd1..685619c88 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -327,7 +327,7 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
 ### Disabling the parser {#disabling}
 
 In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelin). If you don't need
+the [standard processing pipeline](/usage/processing-pipelines). If you don't need
 any of the syntactic information, you should disable the parser. Disabling the
 parser will make spaCy load and run much faster. If you want to load the parser,
 but need to disable it for specific documents, you can also control its use on
From e1f777b15171fef086c20a5d2415f9f2154e1544 Mon Sep 17 00:00:00 2001
From: Antti Ajanki
Date: Tue, 11 Feb 2020 03:32:43 +0200
Subject: [PATCH 3/3] Improvements for Finnish tokenizer (#4985)

* don't split on a colon, since a colon is used to attach suffixes to
  abbreviations
* tokenize on any of LIST_HYPHENS (except a single hyphen), not just on --
* simplify the infix rules by merging similar rules
---
 spacy/lang/fi/punctuation.py          |  9 ++++-----
 spacy/lang/fi/tokenizer_exceptions.py |  3 +++
 spacy/tests/lang/fi/test_tokenizer.py | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 02eb1b200..a85c0b228 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,12 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES
 
 
 _quotes = CONCAT_QUOTES.replace("'", "")
+DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
 
 _infixes = (
     LIST_ELLIPSES
@@ -14,11 +15,9 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{d})(?=[{a}])".format(a=ALPHA, d=DASHES),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index d74deb22b..5469e345e 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -31,6 +31,9 @@ for exc_data in [
     {ORTH: "myöh.", LEMMA: "myöhempi"},
     {ORTH: "n.", LEMMA: "noin"},
     {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "nro", LEMMA: "numero"},
     {ORTH: "ns.", LEMMA: "niin sanottu"},
     {ORTH: "nyk.", LEMMA: "nykyinen"},
     {ORTH: "oik.", LEMMA: "oikealla"},
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index 17f6f0ccc..aab063982 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -19,6 +19,21 @@ HYPHENATED_TESTS = [
     )
 ]
 
+ABBREVIATION_INFLECTION_TESTS = [
+    (
+        "VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
+        ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]
+    ),
+    (
+        "ALV:n osuus on 24 %.",
+        ["ALV:n", "osuus", "on", "24", "%", "."]
+    ),
+    (
+        "Hiihtäjä oli kilpailun 14:s.",
+        ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]
+    )
+]
+
 
 @pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_abbreviations(fi_tokenizer, text, expected_tokens):
@@ -32,3 +47,10 @@ def test_fi_tokenizer_hyphenated_words(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_INFLECTION_TESTS)
+def test_fi_tokenizer_abbreviation_inflections(fi_tokenizer, text, expected_tokens):
+    tokens = fi_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
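Taken together, the rule changes keep colon-attached inflectional endings (VTT:ssa, v:ta, 14:s) on the abbreviation while splitting on the longer dashes in LIST_HYPHENS and leaving the plain ASCII hyphen alone. A minimal sketch of the resulting behavior, assuming a spaCy build that includes this patch (the texts and expected tokens are taken from the new tests):

```python
# Demo of the new Finnish tokenizer behavior, assuming a spaCy build that
# includes this patch. Texts and expected tokens come from the tests above.
from spacy.lang.fi import Finnish

nlp = Finnish()  # blank Finnish pipeline; running it just tokenizes

cases = [
    ("VTT:ssa ennen v:ta 2010 suoritetut mittaukset",
     ["VTT:ssa", "ennen", "v:ta", "2010", "suoritetut", "mittaukset"]),
    ("ALV:n osuus on 24 %.",
     ["ALV:n", "osuus", "on", "24", "%", "."]),
    ("Hiihtäjä oli kilpailun 14:s.",
     ["Hiihtäjä", "oli", "kilpailun", "14:s", "."]),
]
for text, expected in cases:
    tokens = [t.text for t in nlp(text) if not t.is_space]
    assert tokens == expected, tokens
```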