Add an Irish lemmatiser, based on BuNaMo (#9102)

* add tréis/théis * remove previous contents, add demutate/unponc * fmt off/on wrapping * type hints * IrishLemmatizer (sic) * Use spacy-lookups-data>=1.0.3 * Minor bug fixes, refactoring for IrishLemmatizer * Fix return type for ADP list lookups * Fix and refactor lookup table lookups for missing/string/list * Remove unused variables * skip lookup of verbal substantives and adjectives; just demutate * Fix morph checks API details * Add types and format * Move helper methods into lemmatizer Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-11-01 16:37:45 +03:00 · 2021-09-30 13:18:47 +01:00 · 2021-09-30 13:18:47 +01:00 · 8fe525beb5
commit 8fe525beb5
parent 5b0b0ca809
5 changed files with 182 additions and 36 deletions
--- a/setup.cfg
+++ b/setup.cfg
@ -69,7 +69,7 @@ console_scripts =
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.2,<1.1.0
+    spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
    spacy_transformers>=1.0.1,<1.1.0
 ray =
--- a/spacy/lang/ga/init.py
+++ b/spacy/lang/ga/init.py
@ -1,6 +1,11 @@
 from typing import Optional
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from ...language import Language
 from .lemmatizer import IrishLemmatizer
 class IrishDefaults(Language.Defaults):
@ -13,4 +18,16 @@ class Irish(Language):
    Defaults = IrishDefaults
@Irish.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
    default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
 ):
    return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 __all__ = ["Irish"]
--- a/spacy/lang/ga/irish_morphology_helpers.py
+++ b/spacy/lang/ga/irish_morphology_helpers.py
@ -1,35 +0,0 @@
 # fmt: off
 consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
 broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
 slender_vowels = ["e", "é", "i", "í"]
 vowels = broad_vowels + slender_vowels
 # fmt: on
 def ends_dentals(word):
    if word != "" and word[-1] in ["d", "n", "t", "s"]:
        return True
    else:
        return False
 def devoice(word):
    if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
        return word[:-1] + "t"
    else:
        return word
 def ends_with_vowel(word):
    return word != "" and word[-1] in vowels
 def starts_with_vowel(word):
    return word != "" and word[0] in vowels
 def deduplicate(word):
    if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
        return word[:-1]
    else:
        return word
--- a/spacy/lang/ga/lemmatizer.py
+++ b/spacy/lang/ga/lemmatizer.py
@ -0,0 +1,162 @@
 from typing import List, Dict, Tuple
 from ...pipeline import Lemmatizer
 from ...tokens import Token
 class IrishLemmatizer(Lemmatizer):
    # This is a lookup-based lemmatiser using data extracted from
    # BuNaMo (https://github.com/michmech/BuNaMo)
    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "pos_lookup":
            # fmt: off
            required = [
                "lemma_lookup_adj", "lemma_lookup_adp",
                "lemma_lookup_noun", "lemma_lookup_verb"
            ]
            # fmt: on
            return (required, [])
        else:
            return super().get_lookups_config(mode)
    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        univ_pos = token.pos_
        string = unponc(token.text)
        if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
            return [string.lower()]
        demutated = demutate(string)
        secondary = ""
        if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
            secondary = string[1:]
        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
        if token.has_morph():
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            if univ_pos == "NOUN" and (
                "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
            ):
                hpref = "Form=HPref" in token.morph
                return [demutate(string, hpref).lower()]
            elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
                return [demutate(string).lower()]
        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
        def to_list(value):
            if value is None:
                value = []
            elif not isinstance(value, list):
                value = [value]
            return value
        if univ_pos == "ADP":
            return to_list(lookup_table.get(string, string.lower()))
        ret = []
        if univ_pos == "PROPN":
            ret.extend(to_list(lookup_table.get(demutated)))
            ret.extend(to_list(lookup_table.get(secondary)))
        else:
            ret.extend(to_list(lookup_table.get(demutated.lower())))
            ret.extend(to_list(lookup_table.get(secondary.lower())))
        if len(ret) == 0:
            ret = [string.lower()]
        return ret
 def demutate(word: str, is_hpref: bool = False) -> str:
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    lc = word.lower()
    # remove eclipsis
    if lc.startswith("bhf"):
        word = word[2:]
    elif lc.startswith("mb"):
        word = word[1:]
    elif lc.startswith("gc"):
        word = word[1:]
    elif lc.startswith("nd"):
        word = word[1:]
    elif lc.startswith("ng"):
        word = word[1:]
    elif lc.startswith("bp"):
        word = word[1:]
    elif lc.startswith("dt"):
        word = word[1:]
    elif word[0:1] == "n" and word[1:2] in UVOWELS:
        word = word[1:]
    elif lc.startswith("n-") and word[2:3] in LVOWELS:
        word = word[2:]
    # non-standard eclipsis
    elif lc.startswith("bh-f"):
        word = word[3:]
    elif lc.startswith("m-b"):
        word = word[2:]
    elif lc.startswith("g-c"):
        word = word[2:]
    elif lc.startswith("n-d"):
        word = word[2:]
    elif lc.startswith("n-g"):
        word = word[2:]
    elif lc.startswith("b-p"):
        word = word[2:]
    elif lc.startswith("d-t"):
        word = word[2:]
    # t-prothesis
    elif lc.startswith("ts"):
        word = word[1:]
    elif lc.startswith("t-s"):
        word = word[2:]
    # h-prothesis, if known to be present
    elif is_hpref and word[0:1] == "h":
        word = word[1:]
    # h-prothesis, simple case
    # words can also begin with 'h', but unlike eclipsis,
    # a hyphen is not used, so that needs to be handled
    # elsewhere
    elif word[0:1] == "h" and word[1:2] in UVOWELS:
        word = word[1:]
    # lenition
    # this breaks the previous if, to handle super-non-standard
    # text where both eclipsis and lenition were used.
    if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
        word = word[0:1] + word[2:]
    return word
 def unponc(word: str) -> str:
    # fmt: off
    PONC = {
        "ḃ": "bh",
        "ċ": "ch",
        "ḋ": "dh",
        "ḟ": "fh",
        "ġ": "gh",
        "ṁ": "mh",
        "ṗ": "ph",
        "ṡ": "sh",
        "ṫ": "th",
        "Ḃ": "BH",
        "Ċ": "CH",
        "Ḋ": "DH",
        "Ḟ": "FH",
        "Ġ": "GH",
        "Ṁ": "MH",
        "Ṗ": "PH",
        "Ṡ": "SH",
        "Ṫ": "TH"
    }
    # fmt: on
    buf = []
    for ch in word:
        if ch in PONC:
            buf.append(PONC[ch])
        else:
            buf.append(ch)
    return "".join(buf)
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@ -9,6 +9,8 @@ _exc = {
    "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
    "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
    "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
    "théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
    "tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
 }
 for exc_data in [