Merge pull request #3957 from sorenlind/danish-tokenizer-slash

Make Danish tokenizer split on forward slash
This commit is contained in:
Ines Montani 2019-07-12 18:19:22 +02:00 committed by GitHub
commit c0e29f7029
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 1 deletion

View File

@ -14,10 +14,11 @@ _infixes = (
+ [ + [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA), r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
] ]
) )

View File

@ -52,6 +52,7 @@ for exc_data in [
{ORTH: "Ons.", LEMMA: "onsdag"}, {ORTH: "Ons.", LEMMA: "onsdag"},
{ORTH: "Fre.", LEMMA: "fredag"}, {ORTH: "Fre.", LEMMA: "fredag"},
{ORTH: "Lør.", LEMMA: "lørdag"}, {ORTH: "Lør.", LEMMA: "lørdag"},
{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -64,6 +65,8 @@ for orth in [
"mik.", "mik.",
"pers.", "pers.",
"A.D.", "A.D.",
"A/B",
"a/s",
"A/S", "A/S",
"B.C.", "B.C.",
"BK.", "BK.",
@ -79,7 +82,9 @@ for orth in [
"Kprs.", "Kprs.",
"L.A.", "L.A.",
"Ll.", "Ll.",
"m/k",
"m/s", "m/s",
"m/sek.",
"M/S", "M/S",
"Mag.", "Mag.",
"Mr.", "Mr.",
@ -90,6 +95,7 @@ for orth in [
"Sdr.", "Sdr.",
"Skt.", "Skt.",
"Spl.", "Spl.",
"TCP/IP",
"Vg.", "Vg.",
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
@ -141,6 +147,7 @@ for orth in [
"brolægn.", "brolægn.",
"bto.", "bto.",
"bygn.", "bygn.",
"c/o",
"ca.", "ca.",
"cand.", "cand.",
"d.d.", "d.d.",
@ -293,6 +300,7 @@ for orth in [
"kgl.", "kgl.",
"kl.", "kl.",
"kld.", "kld.",
"km/t",
"knsp.", "knsp.",
"komm.", "komm.",
"kons.", "kons.",

View File

@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
tokens = da_tokenizer(text) tokens = da_tokenizer(text)
assert tokens[0].norm_ == norm assert tokens[0].norm_ == norm
@pytest.mark.parametrize(
    "text,n_tokens",
    [
        # Slash-containing abbreviations: the expected counts only add up
        # if the abbreviation is kept together as a single token.
        ("Godt og/eller skidt", 3),
        ("Kør 4 km/t på vejen", 5),
        ("Det blæser 12 m/s.", 5),
        ("Det blæser 12 m/sek. på havnen", 6),
        # A slash between ordinary words (or a digit and a word) is expected
        # to become a token of its own.
        ("Windows 8/Windows 10", 5),
        ("Billeten virker til bus/tog/metro", 8),
        # A purely numeric date is expected to remain one token.
        ("26/02/2019", 1),
        # More abbreviations written with a slash.
        ("Kristiansen c/o Madsen", 3),
        ("Sprogteknologi a/s", 2),
        ("De boede i A/B Bellevue", 5),
        ("Rotorhastigheden er 3400 o/m.", 5),
        ("Jeg købte billet t/r.", 5),
        ("Murerarbejdsmand m/k søges", 3),
        ("Netværket kører over TCP/IP", 4),
    ],
)
def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
    """Check the number of tokens produced for Danish text containing "/".

    Each case pairs an input string with the number of tokens that
    ``da_tokenizer`` is expected to return for it.
    """
    doc = da_tokenizer(text)
    assert len(doc) == n_tokens