# encoding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN

# Exceptions are keyed by the exact token text (ORTH). Each value is a list
# of attribute dicts, one dict per token the string should produce; every
# exception in this module keeps its string as a single token.
_exc = {}
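# For illustration only (hypothetical; not registered in this module): an
# exception that split its string into several tokens would list one
# attribute dict per token, with the ORTH values concatenating back to the
# key, e.g.
#     _exc["np."] = [{ORTH: "np"}, {ORTH: "."}]

# Abbreviations that expand to a full-form lemma, with a coarse POS tag.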
for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},  # "among others"
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},  # "engineer"
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},  # "holder of a master's degree"
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},  # "that is to say"
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},  # "that is"
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:  # "so-called"
    _exc[exc_data[ORTH]] = [exc_data]

# Abbreviations kept as bare single tokens, with no lemma or POS attached:
# "w." (wiek, "century") and "r." (rok, "year"), both common in dates.
for orth in ["w.", "r."]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = _exc
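
# A minimal sketch of how this table is typically hooked up in the language's
# __init__.py (assumed from the pattern other spaCy language packages follow;
# not part of this module):
#
#     from ..tokenizer_exceptions import BASE_EXCEPTIONS
#     from ...util import update_exc
#
#     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)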