spaCy/spacy/lang/mk/lemmatizer.py

from typing import List
from collections import OrderedDict

from ...pipeline import Lemmatizer
from ...tokens import Token


class MacedonianLemmatizer(Lemmatizer):
    """Rule-based lemmatizer for Macedonian.

    Overrides ``rule_lemmatize`` so that the "јќи" ending is stripped (and
    the remainder treated as a verb) before the suffix rules, the lemma
    index and the exception table from ``self.lookups`` are applied.
    """

    def rule_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize a token with the rule, index and exception tables.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): Candidate lemmas. Forms from the exception
            table are placed in front of rule-derived forms; if nothing
            matches, the (possibly suffix-stripped) token text is returned
            as the only candidate.
        """
        string = token.text
        univ_pos = token.pos_.lower()

        # Untagged tokens and whitespace get no analysis beyond lowercasing.
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]

        # Strip the "јќи" ending and look the remainder up as a verb. The
        # length guard keeps a token that consists solely of the ending from
        # being reduced to an empty string (and thus an empty lemma).
        if len(string) > 3 and string.endswith("јќи"):
            string = string[:-3]
            univ_pos = "verb"

        # Per the spaCy v3 Lemmatizer API, is_base_form receives the token
        # itself; the former (univ_pos, morphology) call did not match that
        # signature.
        if callable(self.is_base_form) and self.is_base_form(token):
            return [string.lower()]

        lookups = self.lookups
        index_table = lookups.get_table("lemma_index", {})
        exc_table = lookups.get_table("lemma_exc", {})
        rules_table = lookups.get_table("lemma_rules", {})

        has_data = any(
            (
                index_table.get(univ_pos),
                exc_table.get(univ_pos),
                rules_table.get(univ_pos),
            )
        )
        if not has_data:
            # No lemmatization data for this part of speech: keep the casing
            # of proper nouns, lowercase everything else.
            return [string] if univ_pos == "propn" else [string.lower()]

        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])

        orig = string
        string = string.lower()
        forms = []

        for old, new in rules:
            if not string.endswith(old):
                continue
            form = string[: len(string) - len(old)] + new
            # Keep a candidate only if it is non-empty and either a known
            # lemma or not purely alphabetic.
            if form and (form in index or not form.isalpha()):
                forms.append(form)

        # Drop duplicates while preserving the order in which rules fired.
        forms = list(OrderedDict.fromkeys(forms))
        # Exceptions take priority, so they go to the front of the list.
        for form in exceptions.get(string, []):
            if form not in forms:
                forms.insert(0, form)
        # Fall back to the text as seen before lowercasing.
        if not forms:
            forms.append(orig)

        return forms
Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master * Update Macedonian for v3 * Update Turkish for v3 2020-11-25 12:33:49 +03:00			`from typing import List`
Include Macedonian language (#6230) * Include Macedonian language * Fix indentation at char_classes.py * Fix indentation at char_classes.py * Add Macedonian tests, update lex_attrs and char_classes * Import unicode literals for python 2 2020-10-15 16:55:01 +03:00			`from collections import OrderedDict`

Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master * Update Macedonian for v3 * Update Turkish for v3 2020-11-25 12:33:49 +03:00			`from ...pipeline import Lemmatizer`
			`from ...tokens import Token`
Include Macedonian language (#6230) * Include Macedonian language * Fix indentation at char_classes.py * Fix indentation at char_classes.py * Add Macedonian tests, update lex_attrs and char_classes * Import unicode literals for python 2 2020-10-15 16:55:01 +03:00

			`class MacedonianLemmatizer(Lemmatizer):`
Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master * Update Macedonian for v3 * Update Turkish for v3 2020-11-25 12:33:49 +03:00			`def rule_lemmatize(self, token: Token) -> List[str]:`
			`string = token.text`
			`univ_pos = token.pos_.lower()`
			`morphology = token.morph.to_dict()`
Include Macedonian language (#6230) * Include Macedonian language * Fix indentation at char_classes.py * Fix indentation at char_classes.py * Add Macedonian tests, update lex_attrs and char_classes * Import unicode literals for python 2 2020-10-15 16:55:01 +03:00
			`if univ_pos in ("", "eol", "space"):`
			`return [string.lower()]`

Tidy up and auto-format 2021-01-05 05:41:53 +03:00			`if string[-3:] == "јќи":`
Include Macedonian language (#6230) * Include Macedonian language * Fix indentation at char_classes.py * Fix indentation at char_classes.py * Add Macedonian tests, update lex_attrs and char_classes * Import unicode literals for python 2 2020-10-15 16:55:01 +03:00			`string = string[:-3]`
			`univ_pos = "verb"`

			`if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):`
			`return [string.lower()]`
			`index_table = self.lookups.get_table("lemma_index", {})`
			`exc_table = self.lookups.get_table("lemma_exc", {})`
			`rules_table = self.lookups.get_table("lemma_rules", {})`
Tidy up and auto-format 2021-01-05 05:41:53 +03:00			`if not any(`
			`(`
			`index_table.get(univ_pos),`
			`exc_table.get(univ_pos),`
			`rules_table.get(univ_pos),`
			`)`
			`):`
Include Macedonian language (#6230) * Include Macedonian language * Fix indentation at char_classes.py * Fix indentation at char_classes.py * Add Macedonian tests, update lex_attrs and char_classes * Import unicode literals for python 2 2020-10-15 16:55:01 +03:00			`if univ_pos == "propn":`
			`return [string]`
			`else:`
			`return [string.lower()]`

Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master * Update Macedonian for v3 * Update Turkish for v3 2020-11-25 12:33:49 +03:00			`index = index_table.get(univ_pos, {})`
			`exceptions = exc_table.get(univ_pos, {})`
			`rules = rules_table.get(univ_pos, [])`

Include Macedonian language (#6230) * Include Macedonian language * Fix indentation at char_classes.py * Fix indentation at char_classes.py * Add Macedonian tests, update lex_attrs and char_classes * Import unicode literals for python 2 2020-10-15 16:55:01 +03:00			`orig = string`
			`string = string.lower()`
			`forms = []`

			`for old, new in rules:`
			`if string.endswith(old):`
			`form = string[: len(string) - len(old)] + new`
			`if not form:`
			`continue`
			`if form in index or not form.isalpha():`
			`forms.append(form)`

			`forms = list(OrderedDict.fromkeys(forms))`
			`for form in exceptions.get(string, []):`
			`if form not in forms:`
			`forms.insert(0, form)`
			`if not forms:`
			`forms.append(orig)`

			`return forms`