new tests & tokenization fixes (#4734)

- added tests for the tokenization issues
- fixed tokenization of words with a hyphen infix (a short usage sketch follows the file stats below)
- rewrote the "tokenizer_exceptions.py" file (adapted from the German version)
This commit is contained in:
Christoph Purschke 2019-12-01 23:08:21 +01:00 committed by Ines Montani
parent 48ea2e8d0f
commit a7ee4b6f17
5 changed files with 38 additions and 12 deletions
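For orientation, here is a minimal usage sketch (not part of the commit) of what the reworked tokenizer does with the sentence from the new test case below. It assumes a spaCy install that includes the lb language data from this change and the standard import path `spacy.lang.lb.Luxembourgish`:

```python
# Minimal sketch (not part of the commit): exercising the reworked lb tokenizer.
# Assumes spaCy with the Luxembourgish (lb) language data from this change.
from spacy.lang.lb import Luxembourgish

nlp = Luxembourgish()  # blank pipeline: only the tokenizer runs
doc = nlp("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.")
print([t.text for t in doc])
# Expected per the new test below: 14 tokens. "Grand-Duché" stays one token
# (letter-hyphen-letter is no longer split), "d'Liewen" splits after the
# elision apostrophe, and "'t" is kept whole as a tokenizer exception.
```

Instantiating the blank Luxembourgish class exercises only the tokenizer, which is all this commit touches.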

View File

@@ -6,7 +6,7 @@ from __future__ import unicode_literals
 # variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
 # here one could include the most common spelling mistakes
-_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}
+_exc = {"dass": "datt", "viläicht": "vläicht"}
 
 NORM_EXCEPTIONS = {}

View File

@@ -1,16 +1,23 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 
 ELISION = " ' ".strip().replace(" ", "")
+HYPHENS = r"- ".strip().replace(" ", "")
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[0-9])-(?=[0-9])",
+    ]
+)
 
 TOKENIZER_INFIXES = _infixes
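A side note on the infix list above: only a hyphen between digits is declared an infix, so hyphenated words keep their hyphen. The following is a small standalone sketch (not part of the diff) of that behaviour using plain `re`; the simplified ALPHA range is a hypothetical stand-in for spaCy's much larger character class:

```python
import re

# Hypothetical, simplified stand-in for spaCy's ALPHA character class.
ALPHA = "a-zA-ZÀ-ÿ"

digit_hyphen = re.compile(r"(?<=[0-9])-(?=[0-9])")
letter_comma = re.compile(r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA))

print(bool(digit_hyphen.search("10-12")))        # True: hyphen between digits is an infix
print(bool(digit_hyphen.search("Grand-Duché")))  # False: hyphenated word stays whole
print(bool(letter_comma.search("schéin,mee")))   # True: comma between letters is an infix
```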

View File

@@ -10,7 +10,9 @@ _exc = {}
 # translate / delete what is not necessary
 for exc_data in [
-    {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
+    {ORTH: "'t", LEMMA: "et", NORM: "et"},
+    {ORTH: "'T", LEMMA: "et", NORM: "et"},
+    {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},
     {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
     {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
     {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
@@ -18,7 +20,7 @@ for exc_data in [
     {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
     {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
     {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
-    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
+    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

View File

@@ -3,8 +3,24 @@ from __future__ import unicode_literals
 
 import pytest
 
 
 @pytest.mark.parametrize("text", ["z.B.", "Jan."])
 def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "dWelt", "dSuen"])
+def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
+    tokens = lb_tokenizer(text)
+    assert len(tokens) == 2
+
+
+def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
+    text = "Mee 't ass net evident, d'Liewen."
+    tokens = lb_tokenizer(text)
+    assert len(tokens) == 9
+    assert tokens[1].text == "'t"
+    assert tokens[1].lemma_ == "et"
+
+
+@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
+def test_lb_norm_exceptions(lb_tokenizer, text, norm):
+    tokens = lb_tokenizer(text)
+    assert tokens[0].norm_ == norm

View File

@@ -16,6 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
     [
         ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
         ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
+        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
     ],
 )
 def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):