from typing import Dict, List, Tuple from ...pipeline import Lemmatizer from ...tokens import Token class IrishLemmatizer(Lemmatizer): # This is a lookup-based lemmatiser using data extracted from # BuNaMo (https://github.com/michmech/BuNaMo) @classmethod def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: if mode == "pos_lookup": # fmt: off required = [ "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_noun", "lemma_lookup_verb" ] # fmt: on return (required, []) else: return super().get_lookups_config(mode) def pos_lookup_lemmatize(self, token: Token) -> List[str]: univ_pos = token.pos_ string = unponc(token.text) if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]: return [string.lower()] demutated = demutate(string) secondary = "" if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú": secondary = string[1:] lookup_pos = univ_pos.lower() if univ_pos == "PROPN": lookup_pos = "noun" if token.has_morph(): # TODO: lookup is actually required for the genitive forms, but # this is not in BuNaMo, and would not be of use with IDT. if univ_pos == "NOUN" and ( "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph ): hpref = "Form=HPref" in token.morph return [demutate(string, hpref).lower()] elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph: return [demutate(string).lower()] lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) def to_list(value): if value is None: value = [] elif not isinstance(value, list): value = [value] return value if univ_pos == "ADP": return to_list(lookup_table.get(string, string.lower())) ret = [] if univ_pos == "PROPN": ret.extend(to_list(lookup_table.get(demutated))) ret.extend(to_list(lookup_table.get(secondary))) else: ret.extend(to_list(lookup_table.get(demutated.lower()))) ret.extend(to_list(lookup_table.get(secondary.lower()))) if len(ret) == 0: ret = [string.lower()] return ret def demutate(word: str, is_hpref: bool = False) -> str: UVOWELS = "AÁEÉIÍOÓUÚ" LVOWELS = "aáeéiíoóuú" lc = word.lower() # remove eclipsis if lc.startswith("bhf"): word = word[2:] elif lc.startswith("mb"): word = word[1:] elif lc.startswith("gc"): word = word[1:] elif lc.startswith("nd"): word = word[1:] elif lc.startswith("ng"): word = word[1:] elif lc.startswith("bp"): word = word[1:] elif lc.startswith("dt"): word = word[1:] elif word[0:1] == "n" and word[1:2] in UVOWELS: word = word[1:] elif lc.startswith("n-") and word[2:3] in LVOWELS: word = word[2:] # non-standard eclipsis elif lc.startswith("bh-f"): word = word[3:] elif lc.startswith("m-b"): word = word[2:] elif lc.startswith("g-c"): word = word[2:] elif lc.startswith("n-d"): word = word[2:] elif lc.startswith("n-g"): word = word[2:] elif lc.startswith("b-p"): word = word[2:] elif lc.startswith("d-t"): word = word[2:] # t-prothesis elif lc.startswith("ts"): word = word[1:] elif lc.startswith("t-s"): word = word[2:] # h-prothesis, if known to be present elif is_hpref and word[0:1] == "h": word = word[1:] # h-prothesis, simple case # words can also begin with 'h', but unlike eclipsis, # a hyphen is not used, so that needs to be handled # elsewhere elif word[0:1] == "h" and word[1:2] in UVOWELS: word = word[1:] # lenition # this breaks the previous if, to handle super-non-standard # text where both eclipsis and lenition were used. if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h": word = word[0:1] + word[2:] return word def unponc(word: str) -> str: # fmt: off PONC = { "ḃ": "bh", "ċ": "ch", "ḋ": "dh", "ḟ": "fh", "ġ": "gh", "ṁ": "mh", "ṗ": "ph", "ṡ": "sh", "ṫ": "th", "Ḃ": "BH", "Ċ": "CH", "Ḋ": "DH", "Ḟ": "FH", "Ġ": "GH", "Ṁ": "MH", "Ṗ": "PH", "Ṡ": "SH", "Ṫ": "TH" } # fmt: on buf = [] for ch in word: if ch in PONC: buf.append(PONC[ch]) else: buf.append(ch) return "".join(buf)