Update Polish tokenizer for UD_Polish-PDB (#5432)
Update Polish tokenizer for UD_Polish-PDB, which is a relatively major change from the existing tokenizer. Unused exception files and conflicting test cases have been removed.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
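A minimal sketch (not part of the commit) of how to try the updated rules is to load a blank Polish pipeline and tokenize a couple of the strings used as test cases further down in this diff:

import spacy

# Sketch only: a blank "pl" pipeline uses the Polish tokenizer rules
# touched by this change (prefixes, infixes, suffixes, exceptions).
nlp = spacy.blank("pl")

# Inputs and expected splits are taken from the test cases below.
print([t.text for t in nlp("0 zł 99 gr")])  # ['0', 'zł', '99', 'gr']
print([t.text for t in nlp("cztero-")])     # ['cztero-']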
parent a5cd203284
commit 0061992d95

@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@@ -23,10 +23,15 @@ class PolishDefaults(Language.Defaults):
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }
    tokenizer_exceptions = mod_base_exceptions
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
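The dict comprehension above filters spaCy's shared BASE_EXCEPTIONS, dropping every entry whose orth form ends in a period, presumably so that abbreviation-final periods are handled by the Polish suffix rules rather than by hard-coded exceptions. A small self-contained sketch of the same filter, using a toy stand-in for BASE_EXCEPTIONS:

# Toy stand-in for spaCy's BASE_EXCEPTIONS: ORTH-keyed special cases.
base_exceptions = {
    ":)": [{"ORTH": ":)"}],
    "a.m.": [{"ORTH": "a.m."}],
    "p.m.": [{"ORTH": "p.m."}],
}

# Same idea as the diff: keep only exceptions that do not end with ".",
# so trailing periods fall through to the suffix rules instead.
mod_base_exceptions = {
    exc: val for exc, val in base_exceptions.items() if not exc.endswith(".")
}

print(sorted(mod_base_exceptions))  # [':)']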
File diff suppressed because it is too large

@@ -1,23 +0,0 @@
Copyright (c) 2019, Marcin Miłkowski
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -1,22 +1,46 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

_quotes = CONCAT_QUOTES.replace("'", "")

_prefixes = [r"(długo|krótko|jedno|dwu|trzy|cztero)-"] + BASE_TOKENIZER_PREFIXES

_infixes = (
    LIST_ELLIPSES
    + [CONCAT_ICONS]
    + LIST_ICONS
    + LIST_HYPHENS
    + [
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[0-9{al}])\.(?=[0-9{au}])".format(al=ALPHA, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[:<>=\/](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
    ]
)

_suffixes = (
    ["''", "’’", r"\.", "…"]
    + LIST_PUNCT
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
        ),
        r"(?<=[{au}])\.".format(au=ALPHA_UPPER),
    ]
)


TOKENIZER_PREFIXES = _prefixes
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
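The new prefix rule targets truncated compound modifiers such as "jedno-", "dwu-", "trzy-" and "cztero-" (as in coordinated compounds). A quick standalone check of just that regular expression, as a sketch independent of spaCy's prefix-matching machinery (spaCy compiles prefix patterns anchored at the start of the string, which the explicit ^ below imitates):

import re

# The compound-prefix pattern added in this hunk, anchored like a spaCy prefix.
compound_prefix = re.compile(r"^(długo|krótko|jedno|dwu|trzy|cztero)-")

for text in ["cztero-", "dwu- i trzyletni", "czteroletni"]:
    match = compound_prefix.match(text)
    print(text, "->", match.group(0) if match else None)
# cztero-          -> cztero-
# dwu- i trzyletni -> dwu-
# czteroletni      -> None (no trailing hyphen, so the rule does not fire)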
@@ -1,26 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals

from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ


_exc = {}

for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
]:
    _exc[exc_data[ORTH]] = [exc_data]

for orth in ["w.", "r."]:
    _exc[orth] = [{ORTH: orth}]

for orth in PL_BASE_EXCEPTIONS:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = _exc
@@ -4,49 +4,15 @@ from __future__ import unicode_literals
import pytest

DOT_TESTS = [
    ("tel.", ["tel."]),
    ("np.", ["np."]),
    ("godz. 21:37", ["godz.", "21:37"]),
    ("inż.", ["inż."]),
    ("gosp.-polit.", ["gosp.-polit."]),
    ("ppoż", ["ppoż"]),
    ("płn", ["płn"]),
    ("ul.", ["ul."]),
    ("jw.", ["jw."]),
    ("itd.", ["itd."]),
    ("cdn.", ["cdn."]),
    ("itp.", ["itp."]),
    ("10,- zł", ["10,-", "zł"]),
    ("tel.", ["tel", "."]),
    ("0 zł 99 gr", ["0", "zł", "99", "gr"]),
    ("0,99 rub.", ["0,99", "rub."]),
    ("dol.", ["dol."]),
    ("1000 m n.p.m.", ["1000", "m", "n.p.m."]),
    ("m.in.", ["m.in."]),
    ("p.n.e.", ["p.n.e."]),
    ("Sz.P.", ["Sz.P."]),
    ("p.o.", ["p.o."]),
    ("k.o.", ["k.o."]),
    ("m.st.", ["m.st."]),
    ("dra.", ["dra", "."]),
    ("pp.", ["pp."]),
    ("oo.", ["oo."]),
]

HYPHEN_TESTS = [
    ("5-fluoropentylo-3-pirydynyloindol", ["5-fluoropentylo-3-pirydynyloindol"]),
    ("NESS-040C5", ["NESS-040C5"]),
    ("JTE-7-31", ["JTE-7-31"]),
    ("BAY-59-3074", ["BAY-59-3074"]),
    ("BAY-38-7271", ["BAY-38-7271"]),
    ("STS-135", ["STS-135"]),
    ("5F-PB-22", ["5F-PB-22"]),
    ("cztero-", ["cztero-"]),
    ("jedno-", ["jedno-"]),
    ("dwu-", ["dwu-"]),
    ("trzy-", ["trzy-"]),
    ("b-adoratorzy", ["b-adoratorzy"]),
    ("2-3-4 drzewa", ["2-3-4", "drzewa"]),
    ("b-drzewa", ["b-drzewa"]),
]
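The parametrized test functions that consume these lists fall outside the displayed hunk. A minimal sketch of how such cases are typically exercised in spaCy's test suite, assuming the usual pl_tokenizer fixture from its conftest (the fixture name and test wiring are assumptions, not part of this diff):

import pytest

# Illustrative only: `pl_tokenizer` is the conventional spaCy test fixture
# for the Polish tokenizer; the test name here is made up for the sketch.
@pytest.mark.parametrize("text,expected_tokens", DOT_TESTS + HYPHEN_TESTS)
def test_pl_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    tokens = pl_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list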