From fdf4776262b978a710a73a95e67c8dca87f855e7 Mon Sep 17 00:00:00 2001
From: Magnus Burton
Date: Thu, 22 Dec 2016 22:45:18 +0100
Subject: [PATCH] Added Swedish abbreviations

---
 spacy/sv/tokenizer_exceptions.py | 83 ++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 spacy/sv/tokenizer_exceptions.py

diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py
new file mode 100644
index 000000000..ab2691eda
--- /dev/null
+++ b/spacy/sv/tokenizer_exceptions.py
@@ -0,0 +1,83 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+from ..language_data import PRON_LEMMA
+
+
+# Mapping of Swedish tokenizer exceptions; no entries are defined yet.
+# NOTE(review): the `symbols` star import and PRON_LEMMA are not used below.
+TOKENIZER_EXCEPTIONS = {
+
+}
+
+
+# Swedish abbreviations and unit symbols, written in their exact surface form
+# (periods, colons and inner spaces are part of the string).
+# NOTE(review): presumably each entry is kept together as one token -- confirm
+# against the code that consumes ORTH_ONLY, especially for the entries that
+# contain a space ("i sht", "i st").
+ORTH_ONLY = [
+    "ang.",
+    "anm.",
+    "bil.",
+    "bl.a.",
+    "ca",
+    "cm",
+    "dl",
+    "dvs.",
+    "e.Kr.",
+    "el.",
+    "e.d.",
+    "eng.",
+    "etc.",
+    "exkl.",
+    "f.d.",
+    "fid.",
+    "f.Kr.",
+    "forts.",
+    "fr.o.m.",
+    "f.ö.",
+    "förf.",
+    "ha",
+    "hg",
+    "inkl.",
+    "i sht",
+    "i st",
+    "jmf",
+    "jur.",
+    "kcal",
+    "kg",
+    "kl.",
+    "km",
+    "kr.",
+    "l",
+    "lat.",
+    "m",
+    "m.a.o.",
+    "max.",
+    "m.fl.",
+    "min.",
+    "mm",
+    "m.m.",
+    "ngn",
+    "ngt",
+    "nr",
+    "obs.",
+    "o.d.",
+    "osv.",
+    "p.g.a.",
+    "ref.",
+    "resp.",
+    "s.",
+    "s.a.s.",
+    "s.k.",
+    "st.",
+    "s:t",
+    "t.ex.",
+    "t.o.m.",
+    "tfn",
+    "ung.",
+    "äv.",
+    "övers."
+]