Add Polish lemmatizer (#5413)

* Add Polish lemmatizer (contributed by @ryszardtuora)
* Add missing import
2025-10-19 18:24:30 +03:00 · 2020-05-14 18:23:19 +02:00 · 2020-05-14 18:23:19 +02:00 · f49e2810e6
commit f49e2810e6
parent e63880e081
3 changed files with 116 additions and 1 deletions
--- a/setup.cfg
+++ b/setup.cfg
@ -59,7 +59,7 @@ install_requires =

 [options.extras_require]
 lookups =
-    spacy_lookups_data>=0.0.5,<0.2.0
+    spacy_lookups_data>=0.3.1,<0.4.0
 cuda =
    cupy>=5.0.0b4,<9.0.0
 cuda80 =
--- a/spacy/lang/pl/init.py
+++ b/spacy/lang/pl/init.py
@ -6,12 +6,14 @@ from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .lemmatizer import PolishLemmatizer

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
+from ...lookups import Lookups


 class PolishDefaults(Language.Defaults):
@ -26,6 +28,12 @@ class PolishDefaults(Language.Defaults):
    tag_map = TAG_MAP
    infixes = TOKENIZER_INFIXES

+    @classmethod
+    def create_lemmatizer(cls, nlp=None, lookups=None):
+        # Build the Polish lemmatizer; when the caller supplies no lookups,
+        # a fresh Lookups() container is used instead. `nlp` is not used.
+        tables = Lookups() if lookups is None else lookups
+        return PolishLemmatizer(tables)
+

 class Polish(Language):
    lang = "pl"
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...lemmatizer import Lemmatizer
+from ...parts_of_speech import NAMES
+from ...errors import Errors
+
+
+class PolishLemmatizer(Lemmatizer):
+    """Lookup-based lemmatizer for Polish.
+
+    Lemmas come from per-part-of-speech lookup tables based on the Morfeusz
+    dictionary (morfeusz.sgjp.pl/en) by the Institute of Computer Science PAS.
+    On top of the plain lookup it applies prefix handling ('nie', 'naj') for
+    adjectives and verbs, and case-sensitive handling for nouns.
+    """
+
+    def __init__(self, lookups, *args, **kwargs):
+        """Collect the per-POS lemma tables from `lookups`.
+
+        lookups: table container passed on to the base Lemmatizer; tables are
+            requested under the names "lemma_lookup_<pos>" (pos lowercased).
+        *args, **kwargs: accepted and ignored. This lemmatizer is lookup
+            based, so it does not require an index, exception list, or rules.
+        """
+        # Explicit two-argument super(): this module declares Python 2
+        # compatibility (coding line, unicode_literals), and the zero-argument
+        # form only exists on Python 3.
+        super(PolishLemmatizer, self).__init__(lookups)
+        self.lemma_lookups = {}
+        # NOTE(review): relies on the base __init__ exposing the container as
+        # self.lookups -- confirm against Lemmatizer.
+        for tag in (
+            "ADJ",
+            "ADP",
+            "ADV",
+            "AUX",
+            "NOUN",
+            "NUM",
+            "PART",
+            "PRON",
+            "VERB",
+            "X",
+        ):
+            # The {} argument is presumably the fallback returned when the
+            # table is missing -- verify against Lookups.get_table.
+            self.lemma_lookups[tag] = self.lookups.get_table(
+                "lemma_lookup_" + tag.lower(), {}
+            )
+        # Determiners share the "X" table and proper nouns the "NOUN" table.
+        self.lemma_lookups["DET"] = self.lemma_lookups["X"]
+        self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"]
+
+    def __call__(self, string, univ_pos, morphology=None):
+        """Lemmatize `string` according to its universal POS tag.
+
+        string: the surface form to lemmatize.
+        univ_pos: POS as a name, or as an int that is mapped through NAMES
+            (unknown ints become "X").
+        morphology: forwarded to the POS-specific helpers, which ignore it.
+        RETURNS: a list holding exactly one lemma string.
+        """
+        if isinstance(univ_pos, int):
+            univ_pos = NAMES.get(univ_pos, "X")
+        univ_pos = univ_pos.upper()
+
+        # Nouns are dispatched before any lowercasing because their lookup
+        # is case-sensitive.
+        if univ_pos == "NOUN":
+            return self.lemmatize_noun(string, morphology)
+
+        # Proper nouns keep their original casing for the lookup.
+        if univ_pos != "PROPN":
+            string = string.lower()
+
+        if univ_pos == "ADJ":
+            return self.lemmatize_adj(string, morphology)
+        elif univ_pos == "VERB":
+            return self.lemmatize_verb(string, morphology)
+
+        # Tags without a table fall back to the lowercased form.
+        lemma_dict = self.lemma_lookups.get(univ_pos, {})
+        return [lemma_dict.get(string, string.lower())]
+
+    def lemmatize_adj(self, string, morphology):
+        """Lemmatize an adjective, handling the 'nie' and 'naj' prefixes.
+
+        Lookup order: for a 'nie' prefix, the remainder with a further 'naj'
+        prefix removed, then the remainder itself; for a 'naj' prefix, the
+        remainder; finally the full string, which is returned unchanged when
+        absent from the table. `morphology` is unused.
+        RETURNS: a list holding exactly one lemma string.
+        """
+        lemma_dict = self.lemma_lookups["ADJ"]
+
+        if string.startswith("nie"):
+            search_string = string[3:]
+            if search_string.startswith("naj"):
+                naj_search_string = search_string[3:]
+                if naj_search_string in lemma_dict:
+                    return [lemma_dict[naj_search_string]]
+            if search_string in lemma_dict:
+                return [lemma_dict[search_string]]
+
+        if string.startswith("naj"):
+            naj_search_string = string[3:]
+            if naj_search_string in lemma_dict:
+                return [lemma_dict[naj_search_string]]
+
+        return [lemma_dict.get(string, string)]
+
+    def lemmatize_verb(self, string, morphology):
+        """Lemmatize a verb, handling the 'nie' prefix.
+
+        The form with 'nie' removed is looked up first; otherwise the full
+        string is looked up and returned unchanged when absent from the
+        table. `morphology` is unused.
+        RETURNS: a list holding exactly one lemma string.
+        """
+        lemma_dict = self.lemma_lookups["VERB"]
+
+        if string.startswith("nie"):
+            search_string = string[3:]
+            if search_string in lemma_dict:
+                return [lemma_dict[search_string]]
+
+        return [lemma_dict.get(string, string)]
+
+    def lemmatize_noun(self, string, morphology):
+        """Lemmatize a noun case-sensitively.
+
+        Case is respected so that incorrectly tagged proper names still work.
+        A string containing uppercase characters is looked up lowercased
+        first, then as written, and is returned lowercased when neither is in
+        the table. An all-lowercase string is looked up directly and returned
+        unchanged when absent. `morphology` is unused.
+        RETURNS: a list holding exactly one lemma string.
+        """
+        lemma_dict = self.lemma_lookups["NOUN"]
+        lowered = string.lower()
+
+        if string != lowered:
+            if lowered in lemma_dict:
+                return [lemma_dict[lowered]]
+            elif string in lemma_dict:
+                return [lemma_dict[string]]
+            return [lowered]
+
+        return [lemma_dict.get(string, string)]
+
+    def lookup(self, string, orth=None):
+        """Return `string` lowercased; `orth` is ignored and no table is used."""
+        return string.lower()
+
+    def lemmatize(self, string, index, exceptions, rules):
+        """Not supported: always raises NotImplementedError."""
+        raise NotImplementedError