From 07639dd6ac9db6f874d1f01ccb5e37a910924feb Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 May 2020 10:25:54 +0200
Subject: [PATCH] Remove TAG from da/sv tokenizer exceptions (#5428)

Remove `TAG` value from Danish and Swedish tokenizer exceptions because
it may not be included in a tag map (and these settings are problematic
as tokenizer exceptions anyway).
---
 spacy/lang/da/tokenizer_exceptions.py | 6 +++---
 spacy/lang/sv/tokenizer_exceptions.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 89b083186..9e4637bfb 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others.
 
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
+from ...symbols import ORTH, LEMMA, NORM
 
 
 _exc = {}
@@ -52,7 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
-    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -577,7 +577,7 @@ for h in range(1, 31 + 1):
     for period in ["."]:
         _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
 
-_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
+_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)
 
 TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index dd0976aa6..e95c67f37 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG
+from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
 
 
 _exc = {}
@@ -155,6 +155,6 @@ for orth in ABBREVIATIONS:
 # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 # should be tokenized as two separate tokens.
 for orth in ["i", "m"]:
-    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: ".", TAG: PUNCT}]
+    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
 
 TOKENIZER_EXCEPTIONS = _exc
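
Note (not part of the patch): a minimal sketch of what the changed exception entries do at runtime, assuming a local spaCy install and a blank Swedish pipeline; the example text and the re-registered special case are illustrative, not taken from the diff. The exception splits a trailing "i." into two tokens using only ORTH/NORM values, so no TAG has to resolve against a tag map.

    # Hedged sketch: a tokenizer special case without TAG, mirroring the sv change above.
    import spacy
    from spacy.symbols import ORTH, NORM

    nlp = spacy.blank("sv")  # blank pipeline: tokenizer only, no tagger or tag map
    # Re-register the "i." special case in the post-patch form (ORTH/NORM only).
    nlp.tokenizer.add_special_case("i.", [{ORTH: "i", NORM: "i"}, {ORTH: "."}])

    doc = nlp("att peka i.")
    print([t.text for t in doc])  # expected: ['att', 'peka', 'i', '.']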