@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
    """Build the lemmatizer used by the Polish language defaults.

    nlp: Accepted for signature compatibility; not used by this method.
    lookups: Lookups container handed to the lemmatizer. When omitted
        (None), a new empty `Lookups()` is created instead.
    RETURNS (PolishLemmatizer): A lemmatizer wrapping those lookups.
    """
    # Only substitute a fresh container when the caller passed nothing;
    # an explicitly supplied (even empty) container is used as-is.
    tables = Lookups() if lookups is None else lookups
    return PolishLemmatizer(tables)
class PolishLemmatizer(Lemmatizer):
    """Lookup-based lemmatizer for Polish.

    Lemmas are read from per-part-of-speech tables named
    "lemma_lookup_<pos>" (built from the Morfeusz dictionary,
    morfeusz.sgjp.pl/en, by the Institute of Computer Science PAS).
    On top of the plain lookup it applies prefix-based handling for
    adjectives ("nie", "naj") and verbs ("nie"), and a case-sensitive
    lookup for nouns.

    The tables are resolved from `self.lookups` every time they are needed
    instead of being cached at construction time, so tables that are added
    to (or loaded into) the lookups after the lemmatizer was created are
    still picked up.
    """

    # Universal POS tags that have their own "lemma_lookup_<pos>" table.
    _TABLE_TAGS = (
        "ADJ",
        "ADP",
        "ADV",
        "AUX",
        "NOUN",
        "NUM",
        "PART",
        "PRON",
        "VERB",
        "X",
    )
    # Tags without a table of their own -> tag whose table they share.
    _TAG_ALIASES = {"DET": "X", "PROPN": "NOUN"}

    def __init__(self, lookups, *args, **kwargs):
        """Create the lemmatizer.

        lookups: Lookups container holding the "lemma_lookup_<pos>" tables.
        *args, **kwargs: Accepted and ignored. This lemmatizer is purely
            lookup based, so it does not need an index, exception list or
            rules.
        """
        # Explicit two-argument form: the file targets Python 2 as well
        # (see the `unicode_literals` import), where bare `super()` fails.
        super(PolishLemmatizer, self).__init__(lookups)

    @property
    def lemma_lookups(self):
        """Mapping of POS tag -> lemma table, resolved from the lookups now.

        Kept for backward compatibility with code that read the attribute
        of the same name. A new dict is built on every access; mutating the
        returned dict itself has no effect on later lookups.
        """
        tables = {tag: self._get_table(tag) for tag in self._TABLE_TAGS}
        for alias, target in self._TAG_ALIASES.items():
            tables[alias] = tables[target]
        return tables

    def _get_table(self, univ_pos):
        """Return the lemma table for an upper-case POS tag.

        Tags with no table (and tables missing from the lookups) yield an
        empty dict, so callers fall back to their default lemma.
        """
        tag = self._TAG_ALIASES.get(univ_pos, univ_pos)
        if tag not in self._TABLE_TAGS:
            return {}
        return self.lookups.get_table("lemma_lookup_" + tag.lower(), {})

    def __call__(self, string, univ_pos, morphology=None):
        """Lemmatize a single token.

        string: The surface form to lemmatize.
        univ_pos: The part-of-speech tag, either as a string (any case) or
            as an int that is translated through `NAMES` (unknown ints are
            treated as "X").
        morphology: Passed through to the POS-specific helpers, which do
            not use it.
        RETURNS (list): A one-element list containing the lemma.
        """
        if isinstance(univ_pos, int):
            univ_pos = NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.upper()

        # Nouns are handled before lower-casing: their lookup is
        # case-sensitive.
        if univ_pos == "NOUN":
            return self.lemmatize_noun(string, morphology)

        # Proper nouns keep their casing for the lookup; everything else
        # is looked up in lower case.
        if univ_pos != "PROPN":
            string = string.lower()

        if univ_pos == "ADJ":
            return self.lemmatize_adj(string, morphology)
        if univ_pos == "VERB":
            return self.lemmatize_verb(string, morphology)

        lemma_dict = self._get_table(univ_pos)
        return [lemma_dict.get(string, string.lower())]

    def lemmatize_adj(self, string, morphology):
        """Lemmatize an adjective, trying forms without "nie"/"naj" prefixes.

        For "nie..." the form without "nie" is tried (and, if that remainder
        starts with "naj", the form without "nienaj" is tried first). For
        "naj..." the form without "naj" is tried. Otherwise, or when none of
        the stripped forms is in the table, the string itself is looked up
        and returned unchanged if absent.

        string: The adjective (already lower-cased when coming from
            `__call__`).
        morphology: Unused.
        RETURNS (list): A one-element list containing the lemma.
        """
        lemma_dict = self._get_table("ADJ")

        if string.startswith("nie"):
            search_string = string[3:]
            if search_string.startswith("naj"):
                naj_search_string = search_string[3:]
                if naj_search_string in lemma_dict:
                    return [lemma_dict[naj_search_string]]
            if search_string in lemma_dict:
                return [lemma_dict[search_string]]

        if string.startswith("naj"):
            naj_search_string = string[3:]
            if naj_search_string in lemma_dict:
                return [lemma_dict[naj_search_string]]

        return [lemma_dict.get(string, string)]

    def lemmatize_verb(self, string, morphology):
        """Lemmatize a verb, trying the form without a "nie" prefix first.

        string: The verb (already lower-cased when coming from `__call__`).
        morphology: Unused.
        RETURNS (list): A one-element list containing the lemma; the string
            itself when neither form is in the table.
        """
        lemma_dict = self._get_table("VERB")

        if string.startswith("nie"):
            search_string = string[3:]
            if search_string in lemma_dict:
                return [lemma_dict[search_string]]

        return [lemma_dict.get(string, string)]

    def lemmatize_noun(self, string, morphology):
        """Lemmatize a noun with a case-sensitive lookup.

        The lookup is case-sensitive so that it also works for proper names
        that were incorrectly tagged as common nouns. For a string that
        contains upper-case characters the lower-cased form is tried first,
        then the original form; if both are missing the lower-cased string
        is returned.

        string: The noun, in its original casing.
        morphology: Unused.
        RETURNS (list): A one-element list containing the lemma.
        """
        lemma_dict = self._get_table("NOUN")
        lowered = string.lower()

        if string != lowered:
            if lowered in lemma_dict:
                return [lemma_dict[lowered]]
            if string in lemma_dict:
                return [lemma_dict[string]]
            return [lowered]

        return [lemma_dict.get(string, string)]

    def lookup(self, string, orth=None):
        """Return the lower-cased string; no table is consulted.

        string: The surface form.
        orth: Unused.
        RETURNS: `string.lower()`.
        """
        return string.lower()

    def lemmatize(self, string, index, exceptions, rules):
        """Rule-based lemmatization is not supported by this lemmatizer.

        RAISES (NotImplementedError): Always.
        """
        raise NotImplementedError