spaCy/spacy/lang/mk/lemmatizer.py

62 lines
1.8 KiB
Python
Raw Normal View History

from typing import List
from collections import OrderedDict
from ...pipeline import Lemmatizer
from ...tokens import Token
class MacedonianLemmatizer(Lemmatizer):
def rule_lemmatize(self, token: Token) -> List[str]:
string = token.text
univ_pos = token.pos_.lower()
morphology = token.morph.to_dict()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
2021-01-05 05:41:53 +03:00
if string[-3:] == "јќи":
string = string[:-3]
univ_pos = "verb"
if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
return [string.lower()]
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
2021-01-05 05:41:53 +03:00
if not any(
(
index_table.get(univ_pos),
exc_table.get(univ_pos),
rules_table.get(univ_pos),
)
):
if univ_pos == "propn":
return [string]
else:
return [string.lower()]
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, [])
orig = string
string = string.lower()
forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
continue
if form in index or not form.isalpha():
forms.append(form)
forms = list(OrderedDict.fromkeys(forms))
for form in exceptions.get(string, []):
if form not in forms:
forms.insert(0, form)
if not forms:
forms.append(orig)
return forms