From 2f84c735856b1a69e732c1dfcb3842e0458d8781 Mon Sep 17 00:00:00 2001
From: Jim O'Regan
Date: Mon, 26 Jun 2017 22:40:04 +0100
Subject: [PATCH] a start

---
 spacy/lang/pl/tokenizer_exceptions.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 spacy/lang/pl/tokenizer_exceptions.py

diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py
new file mode 100644
index 000000000..4dffb6209
--- /dev/null
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# ADJ, ADV and NOUN are needed for the POS values below; without them the
+# module raises NameError on import.
+from ..symbols import ORTH, LEMMA, POS, ADJ, ADV, NOUN
+
+_exc = {}
+
+# Polish abbreviations that expand to a single token with a known lemma/POS.
+for exc_data in [
+    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
+    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
+    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
+    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
+    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
+    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
+    # Each exception maps the orth to a *list* of token dicts (no trailing
+    # comma — a trailing comma would wrap the list in a tuple).
+    _exc[exc_data[ORTH]] = [dict(exc_data)]
+
+# Abbreviations kept as-is, with no lemma/POS information.
+for orth in [
+    "w.", "r."]:
+    _exc[orth] = [{ORTH: orth}]
+
+TOKENIZER_EXCEPTIONS = dict(_exc)