From ac8116510d3463995e0788b032b3aecb62acea22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B8ren=20Lind=20Kristiansen?=
Date: Fri, 24 Nov 2017 11:16:53 +0100
Subject: [PATCH] Fix tokenization of 'i.' for Danish.

---
Review notes (commentary only; text in this area is ignored when the patch
is applied):

* Purpose: register a tokenizer exception so that "i." is split into the
  two tokens "i" and "." instead of being kept as one token.
* ADP is added to the symbols import but is not referenced by any line this
  patch adds.
* The "i" token of the new exception carries ORTH/LEMMA/NORM but no TAG,
  while the "." token gets TAG: PUNCT. If ADP was meant to tag "i", that
  assignment is missing -- to be confirmed with the author.
* NOTE(review): indentation and blank context lines in the hunks below were
  restored from the hunk line counts; verify against the target file before
  applying.

 spacy/lang/da/tokenizer_exceptions.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index c67c038bf..303d41158 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, NORM
+from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT
 
 
 _exc = {}
@@ -28,5 +28,12 @@ for orth in [
     "t.o.m.", "vha.", ""]:
     _exc[orth] = [{ORTH: orth}]
 
+_custom_base_exc = {
+    "i.": [
+        {ORTH: "i", LEMMA: "i", NORM: "i"},
+        {ORTH: ".", TAG: PUNCT}]
+}
+_exc.update(_custom_base_exc)
+
 
 TOKENIZER_EXCEPTIONS = _exc