Update Polish tokenizer for UD_Polish-PDB (#5432)

Update the Polish tokenizer for UD_Polish-PDB, which is a relatively major
change from the existing tokenizer. Unused exception files and
conflicting test cases are removed.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
adrianeboyd 2020-05-19 15:59:55 +02:00 committed by GitHub
parent a5cd203284
commit 0061992d95
6 changed files with 39 additions and 1536 deletions
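
As a quick sanity check, the retuned rules can be exercised directly against
the expectations in the updated test file below. This is a minimal sketch,
assuming a spaCy build that already contains this change and registers "pl":

    # Minimal sketch: verify a few of the new expected tokenizations.
    # Assumes a spaCy version that includes this change.
    import spacy

    nlp = spacy.blank("pl")

    assert [t.text for t in nlp("tel.")] == ["tel", "."]
    assert [t.text for t in nlp("0 zł 99 gr")] == ["0", "zł", "99", "gr"]
    assert [t.text for t in nlp("cztero-")] == ["cztero-"]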


@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -23,10 +23,15 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(
         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
     )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    mod_base_exceptions = {
+        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+    }
+    tokenizer_exceptions = mod_base_exceptions
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
 
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
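
The comprehension above keeps only base exceptions that do not end in a
period, so period-final abbreviations are handled by the new suffix rules
rather than by exception entries. A small illustration of the same filter,
runnable outside the class body (import path as in spaCy v2.x):

    # Illustrative sketch of the filter added to PolishDefaults.
    from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS

    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }

    # Period-final entries (e.g. single-letter abbreviations like "a.") are
    # dropped, while emoticons and other non-period exceptions are kept.
    assert all(not exc.endswith(".") for exc in mod_base_exceptions)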

File diff suppressed because it is too large.


@@ -1,23 +0,0 @@
-Copyright (c) 2019, Marcin Miłkowski
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -1,22 +1,46 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
+from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
+from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
 _quotes = CONCAT_QUOTES.replace("'", "")
 
+_prefixes = _prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES
+
 _infixes = (
     LIST_ELLIPSES
-    + [CONCAT_ICONS]
+    + LIST_ICONS
+    + LIST_HYPHENS
     + [
-        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[0-9{al}])\.(?=[0-9{au}])".format(al=ALPHA, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=\/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
     ]
 )
 
+_suffixes = (
+    ["''", "’’", r"\.", "…"]
+    + LIST_PUNCT
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
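
These lists are consumed by spaCy's standard compile helpers, so the new rules
can be inspected in isolation. A sketch using the public spacy.util functions;
the behaviour claims are limited to what the regexes themselves match:

    # Compile the Polish affix rules the same way the tokenizer does.
    from spacy.lang.pl.punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
    from spacy.util import compile_prefix_regex, compile_infix_regex

    prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
    infix_re = compile_infix_regex(TOKENIZER_INFIXES)

    # The added prefix rule matches numeral compound stubs such as "cztero-".
    assert prefix_re.search("cztero-") is not None
    # LIST_HYPHENS as an infix now matches plain hyphens inside a chunk,
    # which is presumably why hyphenated cases like "gosp.-polit." were
    # removed from the tests below as conflicting.
    assert infix_re.search("gosp.-polit.") is not None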


@@ -1,26 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
-from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ
-
-_exc = {}
-
-for exc_data in [
-    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
-    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
-    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
-    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
-    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
-    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
-]:
-    _exc[exc_data[ORTH]] = [exc_data]
-
-for orth in ["w.", "r."]:
-    _exc[orth] = [{ORTH: orth}]
-
-for orth in PL_BASE_EXCEPTIONS:
-    _exc[orth] = [{ORTH: orth}]
-
-TOKENIZER_EXCEPTIONS = _exc
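
With this exceptions module removed, abbreviations such as "m.in." or "inż."
are no longer special-cased and now split at the trailing period. If a
downstream user still wants the old single-token behaviour for a specific
form, it can be restored at runtime; a hedged sketch using the standard
add_special_case API:

    # Sketch: re-add one of the removed abbreviations as a special case.
    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("pl")
    nlp.tokenizer.add_special_case("m.in.", [{ORTH: "m.in."}])

    print([t.text for t in nlp("Wiele miast, m.in. Kraków")])
    # expected: ['Wiele', 'miast', ',', 'm.in.', 'Kraków']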


@@ -4,49 +4,15 @@ from __future__ import unicode_literals
 import pytest
 
 DOT_TESTS = [
-    ("tel.", ["tel."]),
+    ("tel.", ["tel", "."]),
-    ("np.", ["np."]),
-    ("godz. 21:37", ["godz.", "21:37"]),
-    ("inż.", ["inż."]),
-    ("gosp.-polit.", ["gosp.-polit."]),
-    ("ppoż", ["ppoż"]),
-    ("płn", ["płn"]),
-    ("ul.", ["ul."]),
-    ("jw.", ["jw."]),
-    ("itd.", ["itd."]),
-    ("cdn.", ["cdn."]),
-    ("itp.", ["itp."]),
-    ("10,- zł", ["10,-", "zł"]),
     ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
-    ("0,99 rub.", ["0,99", "rub."]),
-    ("dol.", ["dol."]),
-    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
-    ("m.in.", ["m.in."]),
-    ("p.n.e.", ["p.n.e."]),
-    ("Sz.P.", ["Sz.P."]),
-    ("p.o.", ["p.o."]),
-    ("k.o.", ["k.o."]),
-    ("m.st.", ["m.st."]),
-    ("dra.", ["dra", "."]),
-    ("pp.", ["pp."]),
-    ("oo.", ["oo."]),
 ]
 
 HYPHEN_TESTS = [
-    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
-    ("NESS-040C5", ["NESS-040C5"]),
-    ("JTE-7-31", ["JTE-7-31"]),
-    ("BAY-59-3074", ["BAY-59-3074"]),
-    ("BAY-38-7271", ["BAY-38-7271"]),
-    ("STS-135", ["STS-135"]),
-    ("5F-PB-22", ["5F-PB-22"]),
     ("cztero-", ["cztero-"]),
     ("jedno-", ["jedno-"]),
     ("dwu-", ["dwu-"]),
     ("trzy-", ["trzy-"]),
-    ("b-adoratorzy", ["b-adoratorzy"]),
-    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
-    ("b-drzewa", ["b-drzewa"]),
 ]
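
The DOT_TESTS and HYPHEN_TESTS lists above are consumed by parametrized tests
in the same module (not shown in this hunk). A sketch of the usual spaCy test
pattern; the function name is illustrative, and the pl_tokenizer fixture is
assumed to come from the shared test conftest:

    # Sketch of how such case lists are typically driven in spaCy's test suite.
    import pytest

    @pytest.mark.parametrize("text,expected_tokens", DOT_TESTS + HYPHEN_TESTS)
    def test_pl_tokenizer_handles_cases(pl_tokenizer, text, expected_tokens):
        tokens = pl_tokenizer(text)
        assert [t.text for t in tokens] == expected_tokens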