Update Polish tokenizer for UD_Polish-PDB (#5432)

Update the Polish tokenizer for UD_Polish-PDB, which is a relatively major
change from the existing tokenizer. Unused exception files and
conflicting test cases are removed.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
adrianeboyd 2020-05-19 15:59:55 +02:00 committed by GitHub
parent a5cd203284
commit 0061992d95
6 changed files with 39 additions and 1536 deletions
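
As a quick sanity check, the retuned rules can be exercised directly against
the expectations in the updated test file below. This is a minimal sketch,
assuming a spaCy build that already contains this change and registers "pl":

    # Minimal sketch: verify a few of the new expected tokenizations.
    # Assumes a spaCy version that includes this change.
    import spacy

    nlp = spacy.blank("pl")

    assert [t.text for t in nlp("tel.")] == ["tel", "."]
    assert [t.text for t in nlp("0 zł 99 gr")] == ["0", "zł", "99", "gr"]
    assert [t.text for t in nlp("cztero-")] == ["cztero-"]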


@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -23,10 +23,15 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(
         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
     )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    mod_base_exceptions = {
+        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+    }
+    tokenizer_exceptions = mod_base_exceptions
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
 
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
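
The comprehension above keeps only base exceptions that do not end in a
period, so period-final abbreviations are handled by the new suffix rules
rather than by exception entries. A small illustration of the same filter,
runnable outside the class body (import path as in spaCy v2.x):

    # Illustrative sketch of the filter added to PolishDefaults.
    from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS

    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }

    # Period-final entries (e.g. single-letter abbreviations like "a.") are
    # dropped, while emoticons and other non-period exceptions are kept.
    assert all(not exc.endswith(".") for exc in mod_base_exceptions)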

File diff suppressed because it is too large.


@@ -1,23 +0,0 @@
-Copyright (c) 2019, Marcin Miłkowski
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -1,22 +1,46 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
+from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
 _quotes = CONCAT_QUOTES.replace("'", "")
 
+_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+
 _infixes = (
     LIST_ELLIPSES
-    + [CONCAT_ICONS]
+    + LIST_ICONS
+    + LIST_HYPHENS
     + [
-        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[0-9{al}])\.(?=[0-9{au}])".format(al=ALPHA, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=\/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
     ]
 )
 
+_suffixes = (
+    ["''", "’’", r"\.", "…"]
+    + LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
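
These lists are consumed by spaCy's standard compile helpers, so the new rules
can be inspected in isolation. A sketch using the public spacy.util functions;
the behaviour claims are limited to what the regexes themselves match:

    # Compile the Polish affix rules the same way the tokenizer does.
    from spacy.lang.pl.punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
    from spacy.util import compile_prefix_regex, compile_infix_regex

    prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
    infix_re = compile_infix_regex(TOKENIZER_INFIXES)

    # The added prefix rule matches numeral compound stubs such as "cztero-".
    assert prefix_re.search("cztero-") is not None
    # LIST_HYPHENS as an infix now matches plain hyphens inside a chunk,
    # which is presumably why hyphenated cases like "gosp.-polit." were
    # removed from the tests below as conflicting.
    assert infix_re.search("gosp.-polit.") is not None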


@@ -1,26 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
-from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
-
-_exc = {}
-
-for exc_data in [
-    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
-    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
-    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
-    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
-    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
-    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
-]:
-    _exc[exc_data[ORTH]] = [exc_data]
-
-for orth in ["w.", "r."]:
-    _exc[orth] = [{ORTH: orth}]
-
-for orth in PL_BASE_EXCEPTIONS:
-    _exc[orth] = [{ORTH: orth}]
-
-TOKENIZER_EXCEPTIONS = _exc
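
With this exceptions module removed, abbreviations such as "m.in." or "inż."
are no longer special-cased and now split at the trailing period. If a
downstream user still wants the old single-token behaviour for a specific
form, it can be restored at runtime; a hedged sketch using the standard
add_special_case API:

    # Sketch: re-add one of the removed abbreviations as a special case.
    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("pl")
    nlp.tokenizer.add_special_case("m.in.", [{ORTH: "m.in."}])

    print([t.text for t in nlp("Wiele miast, m.in. Kraków")])
    # expected: ['Wiele', 'miast', ',', 'm.in.', 'Kraków']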


@@ -4,49 +4,15 @@ from __future__ import unicode_literals
 import pytest
 
 DOT_TESTS = [
-    ("tel.", ["tel."]),
+    ("tel.", ["tel", "."]),
-    ("np.", ["np."]),
-    ("godz. 21:37", ["godz.", "21:37"]),
-    ("inż.", ["inż."]),
-    ("gosp.-polit.", ["gosp.-polit."]),
-    ("ppoż", ["ppoż"]),
-    ("płn", ["płn"]),
-    ("ul.", ["ul."]),
-    ("jw.", ["jw."]),
-    ("itd.", ["itd."]),
-    ("cdn.", ["cdn."]),
-    ("itp.", ["itp."]),
-    ("10,- zł", ["10,-", "zł"]),
     ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
-    ("0,99 rub.", ["0,99", "rub."]),
-    ("dol.", ["dol."]),
-    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
-    ("m.in.", ["m.in."]),
-    ("p.n.e.", ["p.n.e."]),
-    ("Sz.P.", ["Sz.P."]),
-    ("p.o.", ["p.o."]),
-    ("k.o.", ["k.o."]),
-    ("m.st.", ["m.st."]),
-    ("dra.", ["dra", "."]),
-    ("pp.", ["pp."]),
-    ("oo.", ["oo."]),
 ]
 
 HYPHEN_TESTS = [
-    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
-    ("NESS-040C5", ["NESS-040C5"]),
-    ("JTE-7-31", ["JTE-7-31"]),
-    ("BAY-59-3074", ["BAY-59-3074"]),
-    ("BAY-38-7271", ["BAY-38-7271"]),
-    ("STS-135", ["STS-135"]),
-    ("5F-PB-22", ["5F-PB-22"]),
     ("cztero-", ["cztero-"]),
     ("jedno-", ["jedno-"]),
     ("dwu-", ["dwu-"]),
     ("trzy-", ["trzy-"]),
-    ("b-adoratorzy", ["b-adoratorzy"]),
-    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
-    ("b-drzewa", ["b-drzewa"]),
 ]
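
The DOT_TESTS and HYPHEN_TESTS lists above are consumed by parametrized tests
in the same module (not shown in this hunk). A sketch of the usual spaCy test
pattern; the function name is illustrative, and the pl_tokenizer fixture is
assumed to come from the shared test conftest:

    # Sketch of how such case lists are typically driven in spaCy's test suite.
    import pytest

    @pytest.mark.parametrize("text,expected_tokens", DOT_TESTS + HYPHEN_TESTS)
    def test_pl_tokenizer_handles_cases(pl_tokenizer, text, expected_tokens):
        tokens = pl_tokenizer(text)
        assert [t.text for t in tokens] == expected_tokens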