Fix tokenization of 'i.' for Danish.

This commit is contained in:
Søren Lind Kristiansen 2017-11-24 11:16:53 +01:00
parent 726fb2d0b5
commit ac8116510d

View File

@ -1,7 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, NORM
from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT
# Accumulates the tokenizer exceptions for this module: each key is the exact
# string to special-case, each value is the list of token attribute dicts that
# string is split into. It is filled in below and finally exported as
# TOKENIZER_EXCEPTIONS.
_exc = {}
@ -28,5 +28,12 @@ for orth in [
"t.o.m.", "vha.", ""]:
_exc[orth] = [{ORTH: orth}]
# Special case for the string "i.": tokenize it as the word "i" followed by a
# separate "." token rather than as a single token.
# NOTE(review): the "." token sets TAG to the PUNCT symbol -- confirm that TAG
# (rather than a coarse-grained POS attribute) is the intended key here.
_custom_base_exc = {
    "i.": [
        {ORTH: "i", LEMMA: "i", NORM: "i"},
        {ORTH: ".", TAG: PUNCT},
    ],
}

# Merge the custom entries into the shared table; an existing "i." entry, if
# any, is overwritten.
_exc.update(_custom_base_exc)

# Public name under which this module exposes the assembled exception table.
TOKENIZER_EXCEPTIONS = _exc