diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index c67c038bf..303d41158 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM
+from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT
 
 
 _exc = {}
@@ -28,5 +28,12 @@ for orth in [
     "t.o.m.", "vha.", ""]:
     _exc[orth] = [{ORTH: orth}]
 
+_custom_base_exc = {
+    "i.": [
+        {ORTH: "i", LEMMA: "i", NORM: "i"},
+        {ORTH: ".", TAG: PUNCT}]
+}
+_exc.update(_custom_base_exc)
+
 
 TOKENIZER_EXCEPTIONS = _exc
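
For reference, a minimal usage sketch (not part of the patch) showing the effect of the new exception. It assumes a spaCy 2.x checkout, where spacy.lang.da.Danish is importable and tokenizer exceptions may still carry a TAG attribute:

    # Hypothetical check, not part of the patch: confirm that "i." is split
    # into two tokens and that the TAG from the exception is attached.
    from spacy.lang.da import Danish  # blank pipeline, no trained model needed

    nlp = Danish()
    doc = nlp(u"i.")
    print([(t.text, t.tag_) for t in doc])
    # expected on spaCy 2.x: [('i', ''), ('.', 'PUNCT')]

On a blank pipeline only the attributes set in the exception are populated, which is why the tag of "i" comes back empty while the period is tagged PUNCT.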