spaCy/spacy/lang/ga/lemmatizer.py


from typing import List, Dict, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token


class IrishLemmatizer(Lemmatizer):
    # This is a lookup-based lemmatiser using data extracted from
    # BuNaMo (https://github.com/michmech/BuNaMo)

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "pos_lookup":
            # fmt: off
            required = [
                "lemma_lookup_adj", "lemma_lookup_adp",
                "lemma_lookup_noun", "lemma_lookup_verb"
            ]
            # fmt: on
            return (required, [])
        else:
            return super().get_lookups_config(mode)
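
    # Note on table shape (illustrative assumption, not taken from the BuNaMo
    # data itself): each lemma_lookup_* table maps a surface form to either a
    # single lemma string or a list of candidate lemmas, e.g. something like
    # {"fir": "fear", "mná": ["bean"]}; the to_list() helper below normalises
    # both shapes to a list.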

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        univ_pos = token.pos_
        string = unponc(token.text)
        if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
            return [string.lower()]
        demutated = demutate(string)
        secondary = ""
        if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
            secondary = string[1:]
        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
        if token.has_morph():
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            if univ_pos == "NOUN" and (
                "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
            ):
                hpref = "Form=HPref" in token.morph
                return [demutate(string, hpref).lower()]
            elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
                return [demutate(string).lower()]
        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

        def to_list(value):
            if value is None:
                value = []
            elif not isinstance(value, list):
                value = [value]
            return value

        if univ_pos == "ADP":
            return to_list(lookup_table.get(string, string.lower()))
        ret = []
        if univ_pos == "PROPN":
            ret.extend(to_list(lookup_table.get(demutated)))
            ret.extend(to_list(lookup_table.get(secondary)))
        else:
            ret.extend(to_list(lookup_table.get(demutated.lower())))
            ret.extend(to_list(lookup_table.get(secondary.lower())))
        if len(ret) == 0:
            ret = [string.lower()]
        return ret
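
# Usage sketch (illustrative only, kept as a comment; the exact config and the
# source of the lookup tables are assumptions, not stated in this file). A
# tagger/morphologizer must run first so that token.pos_ and token.morph are
# populated; otherwise pos_lookup_lemmatize falls back to the lowercased
# surface form.
#
#     import spacy
#     nlp = spacy.blank("ga")
#     nlp.add_pipe("lemmatizer", config={"mode": "pos_lookup"})
#     nlp.initialize()  # loads the lemma_lookup_* tables from the lookups data
#     doc = nlp("Bhí na fir ag obair")
#     print([(t.text, t.lemma_) for t in doc])

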
def demutate(word: str, is_hpref: bool = False) -> str:
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    lc = word.lower()
    # remove eclipsis
    if lc.startswith("bhf"):
        word = word[2:]
    elif lc.startswith("mb"):
        word = word[1:]
    elif lc.startswith("gc"):
        word = word[1:]
    elif lc.startswith("nd"):
        word = word[1:]
    elif lc.startswith("ng"):
        word = word[1:]
    elif lc.startswith("bp"):
        word = word[1:]
    elif lc.startswith("dt"):
        word = word[1:]
    elif word[0:1] == "n" and word[1:2] in UVOWELS:
        word = word[1:]
    elif lc.startswith("n-") and word[2:3] in LVOWELS:
        word = word[2:]
    # non-standard eclipsis
    elif lc.startswith("bh-f"):
        word = word[3:]
    elif lc.startswith("m-b"):
        word = word[2:]
    elif lc.startswith("g-c"):
        word = word[2:]
    elif lc.startswith("n-d"):
        word = word[2:]
    elif lc.startswith("n-g"):
        word = word[2:]
    elif lc.startswith("b-p"):
        word = word[2:]
    elif lc.startswith("d-t"):
        word = word[2:]
    # t-prothesis
    elif lc.startswith("ts"):
        word = word[1:]
    elif lc.startswith("t-s"):
        word = word[2:]
    # h-prothesis, if known to be present
    elif is_hpref and word[0:1] == "h":
        word = word[1:]
    # h-prothesis, simple case
    # words can also begin with 'h', but unlike eclipsis,
    # a hyphen is not used, so that needs to be handled
    # elsewhere
    elif word[0:1] == "h" and word[1:2] in UVOWELS:
        word = word[1:]

    # lenition
    # this breaks the previous if, to handle super-non-standard
    # text where both eclipsis and lenition were used; the check is made on
    # the current form (which may already have had eclipsis stripped above),
    # so an eclipsed form such as "bhfear" is not mangled a second time.
    if word[0:1].lower() in "bcdfgmpst" and word[1:2].lower() == "h":
        word = word[0:1] + word[2:]
    return word
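
# Worked examples for demutate() (derived by tracing the rules above, not
# taken from a test suite):
#   demutate("bhfear")   -> "fear"    (eclipsis of f removed)
#   demutate("gcarr")    -> "carr"    (eclipsis of c removed)
#   demutate("n-éan")    -> "éan"     (eclipsis before a lower-case vowel)
#   demutate("tsolas")   -> "solas"   (t-prothesis removed)
#   demutate("bhean")    -> "bean"    (lenition removed)
#   demutate("hÉirinn")  -> "Éirinn"  (h-prothesis before an upper-case vowel)
#   demutate("hata")     -> "hata"    (initial "h" kept unless is_hpref=True)

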
def unponc(word: str) -> str:
    # Expand dotted (ponc séimhithe) consonants from the older orthography
    # to the modern two-letter spelling with "h".
    # fmt: off
    PONC = {
        "ḃ": "bh",
        "ċ": "ch",
        "ḋ": "dh",
        "ḟ": "fh",
        "ġ": "gh",
        "ṁ": "mh",
        "ṗ": "ph",
        "ṡ": "sh",
        "ṫ": "th",
        "Ḃ": "BH",
        "Ċ": "CH",
        "Ḋ": "DH",
        "Ḟ": "FH",
        "Ġ": "GH",
        "Ṁ": "MH",
        "Ṗ": "PH",
        "Ṡ": "SH",
        "Ṫ": "TH"
    }
    # fmt: on
    buf = []
    for ch in word:
        if ch in PONC:
            buf.append(PONC[ch])
        else:
            buf.append(ch)
    return "".join(buf)