spaCy/spacy/lang/ga/lemmatizer.py
Jim O’Regan 8fe525beb5
Add an Irish lemmatiser, based on BuNaMo (#9102)
* add tréis/théis

* remove previous contents, add demutate/unponc

* fmt off/on wrapping

* type hints

* IrishLemmatizer (sic)

* Use spacy-lookups-data>=1.0.3

* Minor bug fixes, refactoring for IrishLemmatizer

* Fix return type for ADP list lookups
* Fix and refactor lookup table lookups for missing/string/list
* Remove unused variables

* skip lookup of verbal substantives and adjectives; just demutate

* Fix morph checks API details

* Add types and format

* Move helper methods into lemmatizer

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2021-09-30 14:18:47 +02:00

163 lines
4.8 KiB
Python

from typing import List, Dict, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class IrishLemmatizer(Lemmatizer):
    # Lookup-driven lemmatiser for Irish, using tables extracted from
    # BuNaMo (https://github.com/michmech/BuNaMo)
    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return (required, optional) lookup-table names for `mode`."""
        if mode != "pos_lookup":
            return super().get_lookups_config(mode)
        # fmt: off
        required = [
            "lemma_lookup_adj", "lemma_lookup_adp",
            "lemma_lookup_noun", "lemma_lookup_verb"
        ]
        # fmt: on
        return (required, [])

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize `token` by POS-specific table lookup on its
        demutated (and de-ponced) form; falls back to the lowercased text."""
        univ_pos = token.pos_
        form = unponc(token.text)
        if univ_pos not in ("PROPN", "ADP", "ADJ", "NOUN", "VERB"):
            return [form.lower()]
        demutated = demutate(form)
        # Alternative candidate: bare h-prefix before a vowel stripped.
        secondary = ""
        if form[0:1].lower() == "h" and form[1:2].lower() in "aáeéiíoóuú":
            secondary = form[1:]
        lookup_pos = "noun" if univ_pos == "PROPN" else univ_pos.lower()
        if token.has_morph():
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            is_vnoun = (
                "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
            )
            if univ_pos == "NOUN" and is_vnoun:
                # Verbal noun: skip the lookup, just demutate.
                hpref = "Form=HPref" in token.morph
                return [demutate(form, hpref).lower()]
            if univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
                # Verbal adjective (participle): likewise, demutate only.
                return [demutate(form).lower()]
        table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

        def as_list(value):
            # Table values may be missing, a string, or a list of strings.
            if isinstance(value, list):
                return value
            return [] if value is None else [value]

        if univ_pos == "ADP":
            return as_list(table.get(form, form.lower()))
        if univ_pos == "PROPN":
            # Proper nouns keep their casing for the lookup.
            keys = (demutated, secondary)
        else:
            keys = (demutated.lower(), secondary.lower())
        results: List[str] = []
        for key in keys:
            results.extend(as_list(table.get(key)))
        return results if results else [form.lower()]
def demutate(word: str, is_hpref: bool = False) -> str:
    """Strip Irish initial mutations (eclipsis, t-/h-prothesis, lenition)
    from `word`, preserving the casing of the remaining letters.

    is_hpref: True when morphology indicates a prothetic leading 'h'
    (Form=HPref), so it is stripped even where the simple heuristic
    (h before an uppercase vowel) would not apply.
    """
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    lc = word.lower()
    # remove eclipsis
    if lc.startswith("bhf"):
        word = word[2:]
    elif lc.startswith(("mb", "gc", "nd", "ng", "bp", "dt")):
        word = word[1:]
    elif word[0:1] == "n" and word[1:2] in UVOWELS:
        word = word[1:]
    elif lc.startswith("n-") and word[2:3] in LVOWELS:
        word = word[2:]
    # non-standard (hyphenated) eclipsis
    elif lc.startswith("bh-f"):
        word = word[3:]
    elif lc.startswith(("m-b", "g-c", "n-d", "n-g", "b-p", "d-t")):
        word = word[2:]
    # t-prothesis
    elif lc.startswith("ts"):
        word = word[1:]
    elif lc.startswith("t-s"):
        word = word[2:]
    # h-prothesis, if known to be present
    elif is_hpref and word[0:1] == "h":
        word = word[1:]
    # h-prothesis, simple case
    # words can also begin with 'h', but unlike eclipsis,
    # a hyphen is not used, so that needs to be handled
    # elsewhere
    elif word[0:1] == "h" and word[1:2] in UVOWELS:
        word = word[1:]
    # lenition
    # this deliberately sits outside the if/elif chain, to handle
    # super-non-standard text where both eclipsis and lenition were used.
    # BUG FIX: recompute the lowercase form after any strip above; the stale
    # `lc` of the original word made e.g. "bhfear" (already stripped to
    # "fear") match its old "bh" prefix and get mangled to "far", while the
    # doubly-mutated case this check exists for (e.g. "gcheart") never fired.
    lc = word.lower()
    if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
        word = word[0:1] + word[2:]
    return word
def unponc(word: str) -> str:
    """Normalise dotted (ponc séimhithe) consonants to letter + 'h'.

    Older Irish orthography marks lenition with a dot above the consonant
    (e.g. ḃ for bh); rewrite each such character to the modern two-letter
    form, keeping case. All other characters pass through unchanged.

    BUG FIX: the mapping's dotted-letter keys had been destroyed by
    mis-encoding (most collapsed to the empty string, producing duplicate
    "" dict keys); the letters are restored here as explicit escapes.
    """
    # fmt: off
    PONC = {
        "\u1e03": "bh", "\u010b": "ch", "\u1e0b": "dh",   # ḃ ċ ḋ
        "\u1e1f": "fh", "\u0121": "gh", "\u1e41": "mh",   # ḟ ġ ṁ
        "\u1e57": "ph", "\u1e61": "sh", "\u1e6b": "th",   # ṗ ṡ ṫ
        "\u1e02": "BH", "\u010a": "CH", "\u1e0a": "DH",   # Ḃ Ċ Ḋ
        "\u1e1e": "FH", "\u0120": "GH", "\u1e40": "MH",   # Ḟ Ġ Ṁ
        "\u1e56": "PH", "\u1e60": "SH", "\u1e6a": "TH",   # Ṗ Ṡ Ṫ
    }
    # fmt: on
    # str.translate does the per-character replacement in one C-level pass.
    return word.translate(str.maketrans(PONC))