Remove TAG from da/sv tokenizer exceptions (#5428)

Remove the `TAG` values from the Danish and Swedish tokenizer exceptions,
because the tags may not be included in a tag map (and these settings are
problematic as tokenizer exceptions anyway).
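For context: in spaCy v2, a tokenizer exception maps a string to a list of per-token attribute dicts, while a language's tag map translates fine-grained tags into coarse POS values. A minimal sketch of the mismatch this avoids (the "CC" entry and the TAG_MAP contents below are illustrative, not spaCy's shipped data):

from spacy.symbols import ORTH, LEMMA, NORM, POS, CCONJ

# A tag like "CC" is only usable if the language's tag map defines it:
TAG_MAP = {"CC": {POS: CCONJ}}  # hypothetical; "CC" may be missing from a tag map

# Hard-coding TAG: "CC" in an exception breaks when "CC" is absent from the
# tag map. With this change the exception sets no TAG, so no entry is needed:
_exc = {"og/eller": [{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}]}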
adrianeboyd 2020-05-13 10:25:54 +02:00 committed by GitHub
parent 24e7108f80
commit 07639dd6ac
2 changed files with 5 additions and 5 deletions

spacy/lang/da/tokenizer_exceptions.py

@@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others.
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
+from ...symbols import ORTH, LEMMA, NORM
 
 _exc = {}
 
@@ -52,7 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
-    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -577,7 +577,7 @@ for h in range(1, 31 + 1):
     for period in ["."]:
         _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
 
-_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
+_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)
 
 TOKENIZER_EXCEPTIONS = _exc
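
The same exception can also be added at runtime via the tokenizer API. A minimal sketch, assuming spaCy v2.x (the sentence is just an example), using only the attributes that remain after this change:

from spacy.lang.da import Danish
from spacy.symbols import ORTH, LEMMA, NORM

nlp = Danish()
# Equivalent runtime exception, without a hard-coded TAG:
nlp.tokenizer.add_special_case(
    "og/eller", [{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}]
)
print([t.text for t in nlp("rød og/eller hvid")])  # ['rød', 'og/eller', 'hvid']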

spacy/lang/sv/tokenizer_exceptions.py

@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG
+from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
 
 _exc = {}
 
@@ -155,6 +155,6 @@ for orth in ABBREVIATIONS:
 # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 # should be tokenized as two separate tokens.
 for orth in ["i", "m"]:
-    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: ".", TAG: PUNCT}]
+    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
 
 TOKENIZER_EXCEPTIONS = _exc
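
A quick sketch of the resulting behaviour (assumes spaCy v2.x; the sentence is illustrative): the exception still splits "i." into two tokens, only the hard-coded PUNCT tag on "." is gone, leaving tagging to the tag map and tagger.

from spacy.lang.sv import Swedish

nlp = Swedish()
doc = nlp("Vi vill peka i.")
# "i." is still split by the exception; "." no longer carries TAG: PUNCT.
print([t.text for t in doc])  # expected: ['Vi', 'vill', 'peka', 'i', '.']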