diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index c8028fdc3..28ce2f145 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -11,6 +11,8 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
+from .punctuation import TOKENIZER_INFIXES
+
 
 class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,6 +24,7 @@ class ItalianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     lemma_lookup = LOOKUP
     tag_map = TAG_MAP
+    infixes = TOKENIZER_INFIXES
 
 
 class Italian(Language):
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
new file mode 100644
index 000000000..4439376c8
--- /dev/null
+++ b/spacy/lang/it/punctuation.py
@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_INFIXES
+from ..char_classes import ALPHA
+
+
+ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+
+
+_infixes = TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+]
+
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index 0a3e2a502..46056e090 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -9,6 +9,8 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
 
+from .punctuation import TOKENIZER_SUFFIXES
+
 
 class UrduDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -18,6 +20,7 @@ class UrduDefaults(Language.Defaults):
     tokenizer_exceptions = BASE_EXCEPTIONS
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES
 
 
 class Urdu(Language):
diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py
new file mode 100644
index 000000000..0d3b01bba
--- /dev/null
+++ b/spacy/lang/ur/punctuation.py
@@ -0,0 +1,10 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_suffixes = TOKENIZER_SUFFIXES + ["۔"]
+
+
+TOKENIZER_SUFFIXES = _suffixes
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index d07b79efe..c4241dfc5 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -65,6 +65,11 @@ def id_tokenizer():
     return get_lang_class("id").Defaults.create_tokenizer()
 
 
+@pytest.fixture(scope="session")
+def it_tokenizer():
+    return get_lang_class("it").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def sv_tokenizer():
     return get_lang_class("sv").Defaults.create_tokenizer()
diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
index 97431cd7b..4583a62b9 100644
--- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py
@@ -11,3 +11,4 @@ def test_contractions(ca_tokenizer, text, expected_tokens):
     """ Test that the contractions are split into two tokens"""
     tokens = ca_tokenizer(text)
     assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
diff --git a/spacy/tests/lang/it/__init__.py b/spacy/tests/lang/it/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..f84351fd7
--- /dev/null
+++ b/spacy/tests/lang/it/test_prefix_suffix_infix.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])]
+)
+def test_contractions(it_tokenizer, text, expected_tokens):
+    """ Test that the contractions are split into two tokens"""
+    tokens = it_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
diff --git a/spacy/tests/lang/ur/test_prefix_suffix_infix.py b/spacy/tests/lang/ur/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..efbf945bf
--- /dev/null
+++ b/spacy/tests/lang/ur/test_prefix_suffix_infix.py
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text", ['ہےں۔', 'کیا۔']
+)
+def test_contractions(ur_tokenizer, text):
+    """Test specific Urdu punctuation character"""
+    tokens = ur_tokenizer(text)
+    assert len(tokens) == 2
diff --git a/spacy/tests/lang/ur/test_text.py b/spacy/tests/lang/ur/test_text.py
index 45d80e027..545e0fa9e 100644
--- a/spacy/tests/lang/ur/test_text.py
+++ b/spacy/tests/lang/ur/test_text.py
@@ -10,7 +10,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
 کہ ایک عدد ٹیلی ویژن ہی کیوں نہ خرید لیں ، سوچا ورلڈ کپ ہی دیکھیں گے۔اپنے پاکستان کے کھلاڑیوں کو دیکھ کر
 ورلڈ کپ دیکھنے کا حوصلہ ہی نہ رہا تو اب یوں ہی ادھر اُدھر کے چینل گھمانے لگ پڑتے ہیں۔"""
     tokens = ur_tokenizer(text)
-    assert len(tokens) == 77
+    assert len(tokens) == 78
 
 
 @pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
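For reference, a minimal sketch of what the new rules do once the patch is applied, assuming a spaCy checkout containing these changes is on the path; the example strings are taken directly from the tests above:

```python
# coding: utf-8
from __future__ import unicode_literals

from spacy.util import get_lang_class

# Italian: the added elision infix splits "l'ha" into "l'" + "ha".
it_tokenizer = get_lang_class("it").Defaults.create_tokenizer()
assert [t.text for t in it_tokenizer("l'ha")] == ["l'", "ha"]

# Urdu: the added "۔" suffix splits the Urdu full stop off the preceding
# word, so "کیا۔" now yields two tokens (and the long-text count in
# test_text.py goes from 77 to 78).
ur_tokenizer = get_lang_class("ur").Defaults.create_tokenizer()
assert len(ur_tokenizer("کیا۔")) == 2
```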