a start

2026-02-16 20:20:41 +03:00 · 2017-06-26 22:40:04 +01:00 · 2017-06-26 22:40:04 +01:00 · 2f84c73585
commit 2f84c73585
parent 28d7f0a672
1 changed files with 23 additions and 0 deletions
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
+
+
+_exc = {}
+# Map each abbreviation's ORTH to a one-item list holding a copy of its attrs.
+for exc_data in [
+    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
+    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
+    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
+    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
+    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
+    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)]  # a list, not a 1-tuple
+
+# Bare abbreviations: the only attribute recorded for each is its own ORTH.
+for orth in ("w.", "r."):
+    _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = _exc.copy()  # shallow copy of the table built above