From f49e2810e6ea5c8b848df5b0f393c27ee31bb7f4 Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Thu, 14 May 2020 18:23:19 +0200
Subject: [PATCH] Add Polish lemmatizer (#5413)

* Add Polish lemmatizer

Contributed by @ryszardtuora

* Add missing import
---
 setup.cfg                   |   2 +-
 spacy/lang/pl/__init__.py   |   8 +++
 spacy/lang/pl/lemmatizer.py | 107 ++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 spacy/lang/pl/lemmatizer.py

diff --git a/setup.cfg b/setup.cfg
index 3e0acd12f..af3579f88 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -59,7 +59,7 @@ install_requires =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=0.0.5,<0.2.0
+    spacy_lookups_data>=0.3.1,<0.4.0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 702a19063..0540bf535 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -6,12 +6,14 @@ from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .lemmatizer import PolishLemmatizer
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
+from ...lookups import Lookups
 
 
 class PolishDefaults(Language.Defaults):
@@ -26,6 +28,12 @@ class PolishDefaults(Language.Defaults):
     tag_map = TAG_MAP
     infixes = TOKENIZER_INFIXES
 
+    @classmethod
+    def create_lemmatizer(cls, nlp=None, lookups=None):
+        # Without caller-supplied lookups, start from an empty Lookups container.
+        tables = Lookups() if lookups is None else lookups
+        return PolishLemmatizer(tables)
+
 
 class Polish(Language):
     lang = "pl"
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
new file mode 100644
index 000000000..2be4b0fb7
--- /dev/null
+++ b/spacy/lang/pl/lemmatizer.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...lemmatizer import Lemmatizer
+from ...parts_of_speech import NAMES
+from ...errors import Errors
+
+
+class PolishLemmatizer(Lemmatizer):
+    """Lookup-based lemmatizer for Polish.
+
+    Lemmas come from per-POS "lemma_lookup_<pos>" tables based on the Morfeusz
+    dictionary (morfeusz.sgjp.pl/en) by the Institute of Computer Science PAS,
+    with prefix handling for adjectives and verbs and case-sensitive handling
+    for nouns. Tables are fetched from `self.lookups` on each call rather than
+    cached in __init__, so tables added after construction are still used.
+    """
+
+    # Coarse POS tags that have their own "lemma_lookup_<tag>" table.
+    _TAGS = ("ADJ", "ADP", "ADV", "AUX", "NOUN", "NUM", "PART", "PRON", "VERB", "X")
+    # Tags without a table of their own, mapped to the tag whose table they use.
+    _ALIASES = {"DET": "X", "PROPN": "NOUN"}
+
+    def __init__(self, lookups, *args, **kwargs):
+        """Pass `lookups` to the base class; extra arguments are ignored."""
+        # Zero-argument super() is Python 3 only, and this module's __future__
+        # import indicates that Python 2 is still supported.
+        super(PolishLemmatizer, self).__init__(lookups)
+
+    def _get_table(self, univ_pos):
+        """Return the current table for `univ_pos`, or {} if there is none."""
+        tag = self._ALIASES.get(univ_pos, univ_pos)
+        if tag not in self._TAGS:
+            return {}
+        return self.lookups.get_table("lemma_lookup_" + tag.lower(), {})
+
+    @property
+    def lemma_lookups(self):
+        """Dict of POS tag -> table, rebuilt from `self.lookups` on access."""
+        tags = self._TAGS + tuple(self._ALIASES)
+        return {tag: self._get_table(tag) for tag in tags}
+
+    def __call__(self, string, univ_pos, morphology=None):
+        """Lemmatize `string` for the given coarse POS.
+
+        string (unicode): The token text.
+        univ_pos (unicode / int): POS name (any case) or ID; unknown IDs -> "X".
+        morphology (dict): Passed on to the POS-specific helpers, unused there.
+        RETURNS (list): A one-element list containing the lemma.
+        """
+        if isinstance(univ_pos, int):
+            univ_pos = NAMES.get(univ_pos, "X")
+        univ_pos = univ_pos.upper()
+
+        if univ_pos == "NOUN":
+            return self.lemmatize_noun(string, morphology)
+
+        # Proper nouns keep their case for the lookup; everything else folds.
+        if univ_pos != "PROPN":
+            string = string.lower()
+
+        if univ_pos == "ADJ":
+            return self.lemmatize_adj(string, morphology)
+        elif univ_pos == "VERB":
+            return self.lemmatize_verb(string, morphology)
+
+        return [self._get_table(univ_pos).get(string, string.lower())]
+
+    def lemmatize_adj(self, string, morphology):
+        """Lemmatize an adjective, also trying it without 'nie'/'naj' prefixes."""
+        lemma_dict = self._get_table("ADJ")
+        candidates = []
+        if string[:3] == "nie":
+            if string[3:6] == "naj":
+                candidates.append(string[6:])
+            candidates.append(string[3:])
+        if string[:3] == "naj":
+            candidates.append(string[3:])
+        # Most-stripped form first; the unchanged string is the last resort.
+        for candidate in candidates:
+            if candidate in lemma_dict:
+                return [lemma_dict[candidate]]
+        return [lemma_dict.get(string, string)]
+
+    def lemmatize_verb(self, string, morphology):
+        """Lemmatize a verb, also trying it without a 'nie' prefix."""
+        lemma_dict = self._get_table("VERB")
+        if string[:3] == "nie" and string[3:] in lemma_dict:
+            return [lemma_dict[string[3:]]]
+        return [lemma_dict.get(string, string)]
+
+    def lemmatize_noun(self, string, morphology):
+        """Lemmatize a noun; case-sensitive to cope with mistagged proper names."""
+        lemma_dict = self._get_table("NOUN")
+        lowered = string.lower()
+        if lowered in lemma_dict:
+            return [lemma_dict[lowered]]
+        # The original casing is tried second; lowercase is the fallback.
+        return [lemma_dict.get(string, lowered)]
+
+    def lookup(self, string, orth=None):
+        """Return the lowercased `string`; `orth` is ignored."""
+        return string.lower()
+
+    def lemmatize(self, string, index, exceptions, rules):
+        """Not supported: lemmatization here is purely lookup based."""
+        raise NotImplementedError