spaCy/spacy/lang/mk/lemmatizer.py

from typing import List
from collections import OrderedDict

from ...pipeline import Lemmatizer
from ...tokens import Token


class MacedonianLemmatizer(Lemmatizer):
    def rule_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        univ_pos = token.pos_.lower()
        morphology = token.morph.to_dict()

        if univ_pos in ("", "eol", "space"):
            return [string.lower()]

        if string[-3:] == "јќи":
            string = string[:-3]
            univ_pos = "verb"

        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        if not any(
            (
                index_table.get(univ_pos),
                exc_table.get(univ_pos),
                rules_table.get(univ_pos),
            )
        ):
            if univ_pos == "propn":
                return [string]
            else:
                return [string.lower()]

        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])

        orig = string
        string = string.lower()
        forms = []

        for old, new in rules:
            if string.endswith(old):
                form = string[: len(string) - len(old)] + new
                if not form:
                    continue
                if form in index or not form.isalpha():
                    forms.append(form)

        forms = list(OrderedDict.fromkeys(forms))
        for form in exceptions.get(string, []):
            if form not in forms:
                forms.insert(0, form)
        if not forms:
            forms.append(orig)

        return forms