Mirror of https://github.com/explosion/spaCy.git, synced 2024-11-14 21:57:15 +03:00
1448ad100c
* Improved stop words list
* Removed some wrong stop words from the list
* Improved stop words list
* Removed some wrong stop words from the list
* Improved Polish Tokenizer (#38)
* Add tests for Polish tokenizer
* Add Polish tokenizer exceptions
* Don't split any words containing hyphens
* Fix test case with wrong model answer
* Remove commented-out line of code until a better solution is found
* Add source srx's license
* Rename exception_list.py to match spaCy naming conventions
* Add a brief explanation of where the exception list comes from
* Add newline after each exception
* Rename COPYING.txt to LICENSE
* Delete old files
* Add header to the license
* Agreements signed
* Stanisław Giziński agreement
* Krzysztof Kowalczyk - signed agreement
* Mateusz Olko agreement
* Add DoomCoder's contributor agreement
* Improve like-number checking in the Polish lang module
* Like-num tests added
* All from the SI system added
* Final license and removed splitting exceptions
* Added Polish stop words to LEX_ATTRS
* Add encoding info to pl tokenizer exceptions
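The like-number improvements mentioned above follow spaCy's usual lex_attrs pattern. Below is a rough, illustrative sketch of that pattern only; the word list and the like_num helper shown here are hypothetical samples, not the actual code added in this PR.

# Illustrative sketch of spaCy's lex_attrs pattern for "like number" detection.
# The Polish word list below is an assumed sample, not the list from this PR.
from spacy.attrs import LIKE_NUM

_num_words = ["zero", "jeden", "dwa", "trzy", "cztery", "pięć",
              "sto", "tysiąc", "milion", "miliard"]

def like_num(text):
    # Accept plain digits (including "10.000"/"10,5" style), simple fractions,
    # and spelled-out number words.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text.lower() in _num_words:
        return True
    return False

LEX_ATTRS = {LIKE_NUM: like_num}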
39 lines
1.5 KiB
Python
# encoding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP

from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS


_exc = {}

# Abbreviations with an expanded lemma and, in most cases, a POS tag.
for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
    {ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
    {ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
    {ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
    {ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
    {ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
    {ORTH: "nt.", LEMMA: "na temat", POS: ADP},
    {ORTH: "ok.", LEMMA: "około"},
    {ORTH: "n.p.u.", LEMMA: "na psa urok"},
    {ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
    _exc[exc_data[ORTH]] = [exc_data]

# Abbreviations kept as single tokens, without additional attributes.
for orth in [
    "w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.",
    "wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
    "min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
    "ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
    "wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
    _exc[orth] = [{ORTH: orth}]

# Merge in the base exception list imported from _tokenizer_exceptions_list.
for orth in PL_BASE_EXCEPTIONS:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = _exc
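For reference, a minimal usage sketch of how these exceptions affect tokenization. It assumes a spaCy installation with the Polish language data available; the blank pipeline, the sample sentence, and the expected behaviour noted in the comments are illustrative assumptions, not part of the file above.

# Minimal usage sketch (assumes spaCy with Polish language data installed).
import spacy

nlp = spacy.blank("pl")  # blank Polish pipeline: tokenizer only
doc = nlp("Mamy ok. 10 przykładów, tzn. wystarczająco dużo.")
print([t.text for t in doc])
# "ok." and "tzn." should remain single tokens instead of being split at the
# trailing period, because they are listed in TOKENIZER_EXCEPTIONS.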