mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Improve Italian & Urdu tokenization accuracy (#3228)
## Description

1. Added the same infix rule as in French (`d'une`, `j'ai`) for Italian (`c'è`, `l'ha`), bringing the F-score on `it_isdt-ud-train.txt` from 96% to 99%. Added a unit test to check this behaviour.
2. Added the Urdu-specific punctuation character (`۔`) as a suffix, improving the F-score on `ur_udtb-ud-train.txt` from 94% to 100%. Added a unit test to check this behaviour.

### Types of change

Enhancement of Italian & Urdu tokenization

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
parent
a3efa3e8d9
commit
9745b0d523
|
@ -11,6 +11,8 @@ from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
class ItalianDefaults(Language.Defaults):
|
class ItalianDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
@ -22,6 +24,7 @@ class ItalianDefaults(Language.Defaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lemma_lookup = LOOKUP
|
lemma_lookup = LOOKUP
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
class Italian(Language):
|
class Italian(Language):
|
||||||
|
|
15
spacy/lang/it/punctuation.py
Normal file
15
spacy/lang/it/punctuation.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..punctuation import TOKENIZER_INFIXES
|
||||||
|
from ..char_classes import ALPHA
|
||||||
|
|
||||||
|
|
||||||
|
# Apostrophe characters that mark an elision: the ASCII apostrophe (')
# and the typographic right single quotation mark (’).
ELISION = "'’"
|
||||||
|
|
||||||
|
|
||||||
|
_infixes = TOKENIZER_INFIXES + [
|
||||||
|
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
|
@ -9,6 +9,8 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
|
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
class UrduDefaults(Language.Defaults):
|
class UrduDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
|
@ -18,6 +20,7 @@ class UrduDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
class Urdu(Language):
|
class Urdu(Language):
|
||||||
|
|
10
spacy/lang/ur/punctuation.py
Normal file
10
spacy/lang/ur/punctuation.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
|
# Extend the shared suffix rules with the Urdu full stop ("۔") so that it is
# split off as its own token when it is attached to the end of a word.
_suffixes = TOKENIZER_SUFFIXES + ["۔"]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
|
@ -65,6 +65,11 @@ def id_tokenizer():
|
||||||
return get_lang_class("id").Defaults.create_tokenizer()
|
return get_lang_class("id").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def it_tokenizer():
|
||||||
|
return get_lang_class("it").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def sv_tokenizer():
|
def sv_tokenizer():
|
||||||
return get_lang_class("sv").Defaults.create_tokenizer()
|
return get_lang_class("sv").Defaults.create_tokenizer()
|
||||||
|
|
|
@ -11,3 +11,4 @@ def test_contractions(ca_tokenizer, text, expected_tokens):
|
||||||
""" Test that the contractions are split into two tokens"""
|
""" Test that the contractions are split into two tokens"""
|
||||||
tokens = ca_tokenizer(text)
|
tokens = ca_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
assert [t.text for t in tokens] == expected_tokens
|
||||||
|
|
0
spacy/tests/lang/it/__init__.py
Normal file
0
spacy/tests/lang/it/__init__.py
Normal file
14
spacy/tests/lang/it/test_prefix_suffix_infix.py
Normal file
14
spacy/tests/lang/it/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])]
|
||||||
|
)
|
||||||
|
def test_contractions(it_tokenizer, text, expected_tokens):
|
||||||
|
""" Test that the contractions are split into two tokens"""
|
||||||
|
tokens = it_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
assert [t.text for t in tokens] == expected_tokens
|
13
spacy/tests/lang/ur/test_prefix_suffix_infix.py
Normal file
13
spacy/tests/lang/ur/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text", ['ہےں۔', 'کیا۔']
|
||||||
|
)
|
||||||
|
def test_contractions(ur_tokenizer, text):
|
||||||
|
"""Test specific Urdu punctuation character"""
|
||||||
|
tokens = ur_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
|
@ -10,7 +10,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
|
||||||
کہ ایک عدد ٹیلی ویژن ہی کیوں نہ خرید لیں ، سوچا ورلڈ کپ ہی دیکھیں گے۔اپنے پاکستان کے کھلاڑیوں کو دیکھ کر
|
کہ ایک عدد ٹیلی ویژن ہی کیوں نہ خرید لیں ، سوچا ورلڈ کپ ہی دیکھیں گے۔اپنے پاکستان کے کھلاڑیوں کو دیکھ کر
|
||||||
ورلڈ کپ دیکھنے کا حوصلہ ہی نہ رہا تو اب یوں ہی ادھر اُدھر کے چینل گھمانے لگ پڑتے ہیں۔"""
|
ورلڈ کپ دیکھنے کا حوصلہ ہی نہ رہا تو اب یوں ہی ادھر اُدھر کے چینل گھمانے لگ پڑتے ہیں۔"""
|
||||||
tokens = ur_tokenizer(text)
|
tokens = ur_tokenizer(text)
|
||||||
assert len(tokens) == 77
|
assert len(tokens) == 78
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
|
@pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
|
||||||
|
|
Loading…
Reference in New Issue
Block a user