From 9745b0d5233b97698f6c2b8832f6e6687d640bc0 Mon Sep 17 00:00:00 2001
From: Sofie
Date: Mon, 4 Feb 2019 22:39:25 +0100
Subject: [PATCH] Improve Italian & Urdu tokenization accuracy (#3228)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

1. Added the same elision infix rule that French already uses (`d'une`, `j'ai`) to Italian (`c'è`, `l'ha`), raising the tokenization F-score on `it_isdt-ud-train.txt` from 96% to 99%. Added a unit test to check this behaviour.
2. Added the Urdu full stop character `۔` as a suffix, raising the F-score on `ur_udtb-ud-train.txt` from 94% to 100%. Added a unit test to check this behaviour.

### Types of change

Enhancement of Italian & Urdu tokenization.

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
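To check the new behaviour interactively, here is a minimal sketch that mirrors the new test fixtures (it assumes an editable install of this branch; the expected splits come from the unit tests added below):

```python
from spacy.util import get_lang_class

# Build bare tokenizers the same way the test fixtures in conftest.py do.
it_tokenizer = get_lang_class("it").Defaults.create_tokenizer()
ur_tokenizer = get_lang_class("ur").Defaults.create_tokenizer()

# Italian elision infix: "c'è" now splits into ["c'", "è"].
print([t.text for t in it_tokenizer("c'è")])

# Urdu full stop suffix: "کیا۔" now splits into two tokens.
print([t.text for t in ur_tokenizer("کیا۔")])
```

The same cases are covered by the new unit tests via the `it_tokenizer` and `ur_tokenizer` fixtures.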
---
 spacy/lang/it/__init__.py                       |  3 +++
 spacy/lang/it/punctuation.py                    | 15 +++++++++++++++
 spacy/lang/ur/__init__.py                       |  3 +++
 spacy/lang/ur/punctuation.py                    | 10 ++++++++++
 spacy/tests/conftest.py                         |  5 +++++
 spacy/tests/lang/ca/test_prefix_suffix_infix.py |  1 +
 spacy/tests/lang/it/__init__.py                 |  0
 spacy/tests/lang/it/test_prefix_suffix_infix.py | 14 ++++++++++++++
 spacy/tests/lang/ur/test_prefix_suffix_infix.py | 13 +++++++++++++
 spacy/tests/lang/ur/test_text.py                |  2 +-
 10 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 spacy/lang/it/punctuation.py
 create mode 100644 spacy/lang/ur/punctuation.py
 create mode 100644 spacy/tests/lang/it/__init__.py
 create mode 100644 spacy/tests/lang/it/test_prefix_suffix_infix.py
 create mode 100644 spacy/tests/lang/ur/test_prefix_suffix_infix.py

diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index c8028fdc3..28ce2f145 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -11,6 +11,8 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
+from .punctuation import TOKENIZER_INFIXES
+
 
 class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,6 +24,7 @@ class ItalianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     lemma_lookup = LOOKUP
     tag_map = TAG_MAP
+    infixes = TOKENIZER_INFIXES
 
 
 class Italian(Language):
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
new file mode 100644
index 000000000..4439376c8
--- /dev/null
+++ b/spacy/lang/it/punctuation.py
@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_INFIXES
+from ..char_classes import ALPHA
+
+
+ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+
+
+_infixes = TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+]
+
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index 0a3e2a502..46056e090 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -9,6 +9,8 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
 
+from .punctuation import TOKENIZER_SUFFIXES
+
 
 class UrduDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -18,6 +20,7 @@ class UrduDefaults(Language.Defaults):
     tokenizer_exceptions = BASE_EXCEPTIONS
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES
 
 
 class Urdu(Language):
diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py
new file mode 100644
index 000000000..0d3b01bba
--- /dev/null
+++ b/spacy/lang/ur/punctuation.py
@@ -0,0 +1,10 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_suffixes = TOKENIZER_SUFFIXES + ["۔"]
+
+
+TOKENIZER_SUFFIXES = _suffixes
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index d07b79efe..c4241dfc5 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -65,6 +65,11 @@ def id_tokenizer():
     return get_lang_class("id").Defaults.create_tokenizer()
 
 
+@pytest.fixture(scope="session")
+def it_tokenizer():
+    return get_lang_class("it").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def sv_tokenizer():
     return get_lang_class("sv").Defaults.create_tokenizer()
diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
index 97431cd7b..4583a62b9 100644
--- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
@@ -11,3 +11,4 @@ def test_contractions(ca_tokenizer, text, expected_tokens):
     """ Test that the contractions are split into two tokens"""
     tokens = ca_tokenizer(text)
     assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
diff --git a/spacy/tests/lang/it/__init__.py b/spacy/tests/lang/it/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..f84351fd7
--- /dev/null
+++ b/spacy/tests/lang/it/test_prefix_suffix_infix.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])]
+)
+def test_contractions(it_tokenizer, text, expected_tokens):
+    """ Test that the contractions are split into two tokens"""
+    tokens = it_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
diff --git a/spacy/tests/lang/ur/test_prefix_suffix_infix.py b/spacy/tests/lang/ur/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..efbf945bf
--- /dev/null
+++ b/spacy/tests/lang/ur/test_prefix_suffix_infix.py
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text", ['ہےں۔', 'کیا۔']
+)
+def test_contractions(ur_tokenizer, text):
+    """Test specific Urdu punctuation character"""
+    tokens = ur_tokenizer(text)
+    assert len(tokens) == 2
diff --git a/spacy/tests/lang/ur/test_text.py b/spacy/tests/lang/ur/test_text.py
index 45d80e027..545e0fa9e 100644
--- a/spacy/tests/lang/ur/test_text.py
+++ b/spacy/tests/lang/ur/test_text.py
@@ -10,7 +10,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
     کہ ایک عدد ٹیلی ویژن ہی کیوں نہ خرید لیں ، سوچا ورلڈ کپ ہی دیکھیں گے۔اپنے پاکستان کے کھلاڑیوں کو دیکھ کر
     ورلڈ کپ دیکھنے کا حوصلہ ہی نہ رہا تو اب یوں ہی ادھر اُدھر کے چینل گھمانے لگ پڑتے ہیں۔"""
     tokens = ur_tokenizer(text)
-    assert len(tokens) == 77
+    assert len(tokens) == 78
 
 
 @pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])