Remove TAG from da/sv tokenizer exceptions (#5428)

Remove the `TAG` values from the Danish and Swedish tokenizer exceptions,
because the tags may not be included in a tag map (and these settings are
problematic as tokenizer exceptions anyway).
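For context: in spaCy v2, a tokenizer exception maps a string to a list of per-token attribute dicts, while a language's tag map translates fine-grained tags into coarse POS values. A minimal sketch of the mismatch this avoids (the "CC" entry and the TAG_MAP contents below are illustrative, not spaCy's shipped data):

from spacy.symbols import ORTH, LEMMA, NORM, POS, CCONJ

# A tag like "CC" is only usable if the language's tag map defines it:
TAG_MAP = {"CC": {POS: CCONJ}}  # hypothetical; "CC" may be missing from a tag map

# Hard-coding TAG: "CC" in an exception breaks when "CC" is absent from the
# tag map. With this change the exception sets no TAG, so no entry is needed:
_exc = {"og/eller": [{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}]}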
adrianeboyd 2020-05-13 10:25:54 +02:00 committed by GitHub
parent 24e7108f80
commit 07639dd6ac
2 changed files with 5 additions and 5 deletions

spacy/lang/da/tokenizer_exceptions.py

@@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others.
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
+from ...symbols import ORTH, LEMMA, NORM
 
 _exc = {}
 
@@ -52,7 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
-    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -577,7 +577,7 @@ for h in range(1, 31 + 1):
     for period in ["."]:
         _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
 
-_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
+_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)
 
 TOKENIZER_EXCEPTIONS = _exc
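
The same exception can also be added at runtime via the tokenizer API. A minimal sketch, assuming spaCy v2.x (the sentence is just an example), using only the attributes that remain after this change:

from spacy.lang.da import Danish
from spacy.symbols import ORTH, LEMMA, NORM

nlp = Danish()
# Equivalent runtime exception, without a hard-coded TAG:
nlp.tokenizer.add_special_case(
    "og/eller", [{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}]
)
print([t.text for t in nlp("rød og/eller hvid")])  # ['rød', 'og/eller', 'hvid']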

spacy/lang/sv/tokenizer_exceptions.py

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG
+from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
 
 _exc = {}
 
@@ -155,6 +155,6 @@ for orth in ABBREVIATIONS:
 # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 # should be tokenized as two separate tokens.
 for orth in ["i", "m"]:
-    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: ".", TAG: PUNCT}]
+    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
 
 TOKENIZER_EXCEPTIONS = _exc
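
A quick sketch of the resulting behaviour (assumes spaCy v2.x; the sentence is illustrative): the exception still splits "i." into two tokens, only the hard-coded PUNCT tag on "." is gone, leaving tagging to the tag map and tagger.

from spacy.lang.sv import Swedish

nlp = Swedish()
doc = nlp("Vi vill peka i.")
# "i." is still split by the exception; "." no longer carries TAG: PUNCT.
print([t.text for t in doc])  # expected: ['Vi', 'vill', 'peka', 'i', '.']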