From 07639dd6ac9db6f874d1f01ccb5e37a910924feb Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Wed, 13 May 2020 10:25:54 +0200
Subject: [PATCH] Remove TAG from da/sv tokenizer exceptions (#5428)

Remove `TAG` value from Danish and Swedish tokenizer exceptions because
it may not be included in a tag map (and these settings are problematic
as tokenizer exceptions anyway).
---
 spacy/lang/da/tokenizer_exceptions.py | 6 +++---
 spacy/lang/sv/tokenizer_exceptions.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 89b083186..9e4637bfb 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -6,7 +6,7 @@ Source: https://forkortelse.dk/ and various others.
 
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
+from ...symbols import ORTH, LEMMA, NORM
 
 
 _exc = {}
@@ -52,7 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
-    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -577,7 +577,7 @@ for h in range(1, 31 + 1):
     for period in ["."]:
         _exc["%d%s" % (h, period)] = [{ORTH: "%d." % h}]
 
-_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: ".", TAG: PUNCT}]}
+_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)
 
 TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index dd0976aa6..e95c67f37 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG
+from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
 
 
 _exc = {}
@@ -155,6 +155,6 @@ for orth in ABBREVIATIONS:
 # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 # should be tokenized as two separate tokens.
 for orth in ["i", "m"]:
-    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: ".", TAG: PUNCT}]
+    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
 
 TOKENIZER_EXCEPTIONS = _exc
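
Note (not part of the patch): a minimal sketch of what the changed exception entries do at runtime, assuming a local spaCy install and a blank Swedish pipeline; the example text and the re-registered special case are illustrative, not taken from the diff. The exception splits a trailing "i." into two tokens using only ORTH/NORM values, so no TAG has to resolve against a tag map.

    # Hedged sketch: a tokenizer special case without TAG, mirroring the sv change above.
    import spacy
    from spacy.symbols import ORTH, NORM

    nlp = spacy.blank("sv")  # blank pipeline: tokenizer only, no tagger or tag map
    # Re-register the "i." special case in the post-patch form (ORTH/NORM only).
    nlp.tokenizer.add_special_case("i.", [{ORTH: "i", NORM: "i"}, {ORTH: "."}])

    doc = nlp("att peka i.")
    print([t.text for t in doc])  # expected: ['att', 'peka', 'i', '.']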