Improve Italian & Urdu tokenization accuracy (#3228)

## Description

1. Added the same infix rule as in French (`d'une`, `j'ai`) for Italian (`c'è`, `l'ha`), bringing the tokenization F-score on `it_isdt-ud-train.txt` from 96% to 99%. Added a unit test to check this behaviour.

2. Added the Urdu full stop `۔` as a suffix character, improving the F-score on `ur_udtb-ud-train.txt` from 94% to 100%. Added a unit test to check this behaviour. A quick sketch of both changes in action follows below.
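
For illustration, a minimal sketch of the new behaviour (this snippet is not part of the diff; it assumes a spaCy install that includes these changes and uses the blank `Italian`/`Urdu` language classes directly):

```python
from spacy.lang.it import Italian
from spacy.lang.ur import Urdu

it_nlp = Italian()  # blank Italian pipeline: tokenizer only
ur_nlp = Urdu()     # blank Urdu pipeline: tokenizer only

# The new elision infix splits the apostrophe contractions into two tokens.
assert [t.text for t in it_nlp("c'è")] == ["c'", "è"]
assert [t.text for t in it_nlp("l'ha")] == ["l'", "ha"]

# The Urdu full stop is now split off as a suffix token.
assert [t.text for t in ur_nlp("کیا۔")] == ["کیا", "۔"]
```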

### Types of change
Enhancement of Italian & Urdu tokenization

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
Authored by Sofie on 2019-02-04 22:39:25 +01:00, committed by Ines Montani.
Commit 9745b0d523 (parent a3efa3e8d9): 10 changed files with 65 additions and 1 deletion.


@@ -11,6 +11,8 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
+from .punctuation import TOKENIZER_INFIXES

 class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -22,6 +24,7 @@ class ItalianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     lemma_lookup = LOOKUP
     tag_map = TAG_MAP
+    infixes = TOKENIZER_INFIXES

 class Italian(Language):


@@ -0,0 +1,15 @@
# coding: utf8
from __future__ import unicode_literals

from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA


ELISION = " ' ".strip().replace(" ", "").replace("\n", "")


_infixes = TOKENIZER_INFIXES + [
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
]

TOKENIZER_INFIXES = _infixes
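
As a side note (not part of the diff), here is a rough illustration of how this pattern behaves, using plain `re` and a simplified stand-in for spaCy's `ALPHA` character class: the lookbehind requires a letter followed by the elision apostrophe, the lookahead requires another letter, so the zero-width match marks the split point right after the elided article.

```python
import re

# Simplified stand-ins; spaCy's ALPHA covers many more Unicode letter ranges.
ALPHA = "a-zA-Zàèéìòù"
ELISION = "'"

pattern = r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)

# On Python 3.7+, re.split splits on the zero-width match between
# the elided article and the following word.
print(re.split(pattern, "l'ha"))  # ["l'", 'ha']
print(re.split(pattern, "c'è"))   # ["c'", 'è']
```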


@@ -9,6 +9,8 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
+from .punctuation import TOKENIZER_SUFFIXES

 class UrduDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -18,6 +20,7 @@ class UrduDefaults(Language.Defaults):
     tokenizer_exceptions = BASE_EXCEPTIONS
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES

 class Urdu(Language):


@@ -0,0 +1,10 @@
# coding: utf8
from __future__ import unicode_literals

from ..punctuation import TOKENIZER_SUFFIXES


_suffixes = TOKENIZER_SUFFIXES + ["۔"]

TOKENIZER_SUFFIXES = _suffixes
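
For context (again not part of the diff, and assuming spaCy's public `compile_suffix_regex` helper), this is roughly how a suffixes list like the one above ends up matching the Urdu full stop at the end of a token:

```python
from spacy.lang.punctuation import TOKENIZER_SUFFIXES
from spacy.util import compile_suffix_regex

# Rebuild the suffix matcher with the Urdu full stop appended, mirroring
# what the Urdu defaults now do, and check that it matches at the end of a word.
suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES + ["۔"])
match = suffix_re.search("کیا۔")
print(match.group())  # '۔'
```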


@@ -65,6 +65,11 @@ def id_tokenizer():
     return get_lang_class("id").Defaults.create_tokenizer()


+@pytest.fixture(scope="session")
+def it_tokenizer():
+    return get_lang_class("it").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def sv_tokenizer():
     return get_lang_class("sv").Defaults.create_tokenizer()


@@ -11,3 +11,4 @@ def test_contractions(ca_tokenizer, text, expected_tokens):
     """ Test that the contractions are split into two tokens"""
     tokens = ca_tokenizer(text)
     assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens


@@ -0,0 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])]
)
def test_contractions(it_tokenizer, text, expected_tokens):
    """ Test that the contractions are split into two tokens"""
    tokens = it_tokenizer(text)
    assert len(tokens) == 2
    assert [t.text for t in tokens] == expected_tokens


@@ -0,0 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "text", ['ہےں۔', 'کیا۔']
)
def test_contractions(ur_tokenizer, text):
    """Test specific Urdu punctuation character"""
    tokens = ur_tokenizer(text)
    assert len(tokens) == 2


@@ -10,7 +10,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
 کہ ایک عدد ٹیلی ویژن ہی کیوں نہ خرید لیں ، سوچا ورلڈ کپ ہی دیکھیں گے۔اپنے پاکستان کے کھلاڑیوں کو دیکھ کر
 ورلڈ کپ دیکھنے کا حوصلہ ہی نہ رہا تو اب یوں ہی ادھر اُدھر کے چینل گھمانے لگ پڑتے ہیں۔"""
     tokens = ur_tokenizer(text)
-    assert len(tokens) == 77
+    assert len(tokens) == 78


 @pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])