Improve Italian & Urdu tokenization accuracy (#3228)

## Description

1. Added the same infix rule as in French (`d'une`, `j'ai`) for Italian (`c'è`, `l'ha`), bringing the F-score on `it_isdt-ud-train.txt` from 96% to 99%. Added a unit test to check this behaviour.
2. Added a specific Urdu punctuation character as a suffix, improving the F-score on `ur_udtb-ud-train.txt` from 94% to 100%. Added a unit test to check this behaviour.

### Types of change

Enhancement of Italian & Urdu tokenization.

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
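A minimal end-to-end check of the two behaviours described above (a sketch, assuming a spaCy 2.x install that includes this commit; it builds bare tokenizers the same way as the test fixtures added below):

```python
from spacy.util import get_lang_class

# Build bare tokenizers for Italian and Urdu, as the test fixtures below do.
it_tokenizer = get_lang_class("it").Defaults.create_tokenizer()
ur_tokenizer = get_lang_class("ur").Defaults.create_tokenizer()

# Italian elision: "c'è" and "l'ha" are now split after the apostrophe.
assert [t.text for t in it_tokenizer("c'è")] == ["c'", "è"]

# The Urdu full stop (۔, U+06D4) is now split off as its own suffix token.
assert [t.text for t in ur_tokenizer("کیا۔")] == ["کیا", "۔"]
```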
Commit 9745b0d523 (parent a3efa3e8d9)
spacy/lang/it/__init__.py

@@ -11,6 +11,8 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
+from .punctuation import TOKENIZER_INFIXES
+
 
 class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -22,6 +24,7 @@ class ItalianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     lemma_lookup = LOOKUP
     tag_map = TAG_MAP
+    infixes = TOKENIZER_INFIXES
 
 
 class Italian(Language):
spacy/lang/it/punctuation.py (new file, 15 lines)

@@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_INFIXES
+from ..char_classes import ALPHA
+
+
+ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+
+
+_infixes = TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+]
+
+TOKENIZER_INFIXES = _infixes
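For readers unfamiliar with the pattern: it matches a zero-width position between a letter-plus-elision-mark pair and the following letter, which is where the tokenizer cuts. A minimal standalone sketch with plain `re`; the small `ALPHA` stand-in below is an assumption for illustration only, since spaCy's real `ALPHA` class covers far more characters:

```python
import re

# Stand-in for spaCy's ALPHA character class (assumption: just enough
# letters for the two examples; the real class is much larger).
ALPHA = "a-zàèéìòù"
ELISION = "'’"

# Zero-width split point: a letter followed by an elision mark, with
# another letter immediately after.
infix_re = re.compile(r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION))

for text in ("l'ha", "c'è"):
    match = infix_re.search(text)
    # The match is empty, but its position is where the tokenizer cuts.
    print([text[:match.start()], text[match.start():]])

# Output:
# ["l'", 'ha']
# ["c'", 'è']
```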
spacy/lang/ur/__init__.py

@@ -9,6 +9,8 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
 
+from .punctuation import TOKENIZER_SUFFIXES
+
 
 class UrduDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -18,6 +20,7 @@ class UrduDefaults(Language.Defaults):
     tokenizer_exceptions = BASE_EXCEPTIONS
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES
 
 
 class Urdu(Language):
spacy/lang/ur/punctuation.py (new file, 10 lines)

@@ -0,0 +1,10 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_suffixes = TOKENIZER_SUFFIXES + ["۔"]
+
+
+TOKENIZER_SUFFIXES = _suffixes
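To sketch how the new suffix entry takes effect (assuming a spaCy 2.x install that includes this commit's Urdu data): `compile_suffix_regex` is the spaCy utility that joins the suffix list into a single end-anchored regex, which the tokenizer applies to the end of each remaining substring.

```python
from spacy.util import compile_suffix_regex
from spacy.lang.ur.punctuation import TOKENIZER_SUFFIXES

# '۔' is now one of the suffix patterns.
assert "۔" in TOKENIZER_SUFFIXES

# The tokenizer tries this regex at the end of each remaining substring;
# a match is split off as its own suffix token.
suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)
print(suffix_re.search("کیا۔").group())  # '۔' — split off, leaving 'کیا'
```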
spacy/tests/conftest.py

@@ -65,6 +65,11 @@ def id_tokenizer():
     return get_lang_class("id").Defaults.create_tokenizer()
 
 
+@pytest.fixture(scope="session")
+def it_tokenizer():
+    return get_lang_class("it").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def sv_tokenizer():
     return get_lang_class("sv").Defaults.create_tokenizer()
spacy/tests/lang/ca/test_prefix_suffix_infix.py

@@ -11,3 +11,4 @@ def test_contractions(ca_tokenizer, text, expected_tokens):
     """ Test that the contractions are split into two tokens"""
     tokens = ca_tokenizer(text)
     assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
spacy/tests/lang/it/__init__.py (new empty file)
spacy/tests/lang/it/test_prefix_suffix_infix.py (new file, 14 lines)

@@ -0,0 +1,14 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])]
+)
+def test_contractions(it_tokenizer, text, expected_tokens):
+    """ Test that the contractions are split into two tokens"""
+    tokens = it_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
spacy/tests/lang/ur/test_prefix_suffix_infix.py (new file, 13 lines)

@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text", ['ہےں۔', 'کیا۔']
+)
+def test_contractions(ur_tokenizer, text):
+    """Test specific Urdu punctuation character"""
+    tokens = ur_tokenizer(text)
+    assert len(tokens) == 2
spacy/tests/lang/ur/test_text.py

@@ -10,7 +10,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
 کہ ایک عدد ٹیلی ویژن ہی کیوں نہ خرید لیں ، سوچا ورلڈ کپ ہی دیکھیں گے۔اپنے پاکستان کے کھلاڑیوں کو دیکھ کر
 ورلڈ کپ دیکھنے کا حوصلہ ہی نہ رہا تو اب یوں ہی ادھر اُدھر کے چینل گھمانے لگ پڑتے ہیں۔"""
     tokens = ur_tokenizer(text)
-    assert len(tokens) == 77
+    assert len(tokens) == 78
 
 
 @pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
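The expected token count in this long-text test rises from 77 to 78 because the sentence-final `۔` in the sample text is now split off as its own suffix token.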