new tests & tokenization fixes (#4734)

- added some tests for tokenization issues
- fixed some issues with the tokenization of words with a hyphen infix
- rewrote the "tokenizer_exceptions.py" file (adapted from the German version)
Christoph Purschke 2019-12-01 23:08:21 +01:00 committed by Ines Montani
parent 48ea2e8d0f
commit a7ee4b6f17
5 changed files with 38 additions and 12 deletions
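To make the intended behaviour concrete, here is a minimal usage sketch (not part of the commit). It assumes spaCy v2.2+, where the Luxembourgish language class is available from spacy.lang.lb:

# Usage sketch (assumption: spaCy v2.2+ with spacy.lang.lb available); not part of the diff.
from spacy.lang.lb import Luxembourgish

nlp = Luxembourgish()
doc = nlp("Mee 't ass net evident, d'Liewen am Grand-Duché.")
print([t.text for t in doc])
# Expected: "'t" stays a single token via the tokenizer exceptions, "d'" is split
# off "Liewen" by the elision infix rule, and "Grand-Duché" remains one token
# because only digit-digit hyphens are treated as infixes.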

View File

@@ -6,7 +6,7 @@ from __future__ import unicode_literals
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes
_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}
_exc = {"dass": "datt", "viläicht": "vläicht"}
NORM_EXCEPTIONS = {}
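For context, NORM_EXCEPTIONS is normally filled from _exc a few lines further down in the file; that loop sits outside the hunk shown above. A sketch of the usual pattern, assuming lb follows the other spaCy languages:

# Assumed continuation of norm_exceptions.py (not shown in the hunk above):
# copy each entry of _exc into NORM_EXCEPTIONS, plus its title-cased variant.
for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm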

View File

@@ -1,16 +1,23 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
ELISION = " ' ’ ".strip().replace(" ", "")
HYPHENS = r"- ".strip().replace(" ", "")
_infixes = TOKENIZER_INFIXES + [
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
]
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        r"(?<=[0-9])-(?=[0-9])",
    ]
)
TOKENIZER_INFIXES = _infixes
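The elision rule can be sanity-checked in isolation with plain re; a small illustrative sketch (the ELISION value is restated locally rather than imported from the file above):

# Standalone check of the elision infix pattern (illustrative only; not part of the diff).
import re

from spacy.lang.char_classes import ALPHA

ELISION = "'’"  # straight and typographic apostrophes
infix_re = re.compile(r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION))

print(bool(infix_re.search("d'Saach")))  # True: a split point is found after "d'"
print(bool(infix_re.search("dSaach")))   # False: no elision character, no split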

View File

@@ -10,7 +10,9 @@ _exc = {}
# translate / delete what is not necessary
for exc_data in [
{ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
{ORTH: "'t", LEMMA: "et", NORM: "et"},
{ORTH: "'T", LEMMA: "et", NORM: "et"},
{ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},
{ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
{ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
@@ -18,7 +20,7 @@ for exc_data in [
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
]:
_exc[exc_data[ORTH]] = [exc_data]
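For reference, such an exceptions dict is normally merged with the shared base exceptions before being exposed; a sketch of the usual spaCy v2 pattern (assumed here, not shown in this hunk):

# Assumed surrounding code following the common spaCy v2 pattern (not part of the diff):
# the imports sit at the top of the file, the merge at the bottom.
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)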

View File

@@ -3,8 +3,24 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize("text", ["z.B.", "Jan."])
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 1

@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "dWelt", "dSuen"])
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 2


def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
    text = "Mee 't ass net evident, d'Liewen."
    tokens = lb_tokenizer(text)
    assert len(tokens) == 9
    assert tokens[1].text == "'t"
    assert tokens[1].lemma_ == "et"


@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
    tokens = lb_tokenizer(text)
    assert tokens[0].norm_ == norm
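The lb_tokenizer fixture used by these tests comes from the shared test conftest; a minimal stand-in for running them in isolation could look like this (an assumption based on spaCy's usual conftest pattern, not part of the commit):

# Minimal stand-in for the shared lb_tokenizer fixture (assumed conftest pattern).
import pytest

from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def lb_tokenizer():
    return get_lang_class("lb").Defaults.create_tokenizer()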

View File

@@ -16,6 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
    [
        ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
        ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
    ],
)
def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):