Add an Irish lemmatiser, based on BuNaMo (#9102)

* add tréis/théis

* remove previous contents, add demutate/unponc

* fmt off/on wrapping

* type hints

* IrishLemmatizer (sic)

* Use spacy-lookups-data>=1.0.3

* Minor bug fixes, refactoring for IrishLemmatizer

* Fix return type for ADP list lookups
* Fix and refactor lookup table lookups for missing/string/list
* Remove unused variables

* skip lookup of verbal substantives and adjectives; just demutate

* Fix morph checks API details

* Add types and format

* Move helper methods into lemmatizer

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Jim O’Regan 2021-09-30 13:18:47 +01:00 committed by GitHub
parent 5b0b0ca809
commit 8fe525beb5
5 changed files with 182 additions and 36 deletions

setup.cfg

@@ -69,7 +69,7 @@ console_scripts =
[options.extras_require]
lookups =
-    spacy_lookups_data>=1.0.2,<1.1.0
+    spacy_lookups_data>=1.0.3,<1.1.0
transformers =
    spacy_transformers>=1.0.1,<1.1.0
ray =
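The bumped lower bound matters because the Irish lookup tables the new lemmatizer requires are shipped via spacy-lookups-data, pulled in by the lookups extra (pip install "spacy[lookups]"). A minimal sketch of loading those tables directly, assuming that package is installed; the table names come from IrishLemmatizer.get_lookups_config below, everything else is illustrative:

from spacy.lookups import load_lookups

tables = [
    "lemma_lookup_adj", "lemma_lookup_adp",
    "lemma_lookup_noun", "lemma_lookup_verb",
]
lookups = load_lookups(lang="ga", tables=tables)
for name in tables:
    # each table maps (demutated) surface forms to a lemma or a list of lemmas
    print(name, len(lookups.get_table(name)))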

spacy/lang/ga/__init__.py

@@ -1,6 +1,11 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ...language import Language
from .lemmatizer import IrishLemmatizer
class IrishDefaults(Language.Defaults):
@@ -13,4 +18,16 @@ class Irish(Language):
    Defaults = IrishDefaults


@Irish.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
    return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Irish"]
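A rough usage sketch for the factory registered above, assuming spacy-lookups-data with the "ga" tables is installed; the sample sentence is illustrative. Since "pos_lookup" keys off token.pos_, a pipeline without a tagger or morphologizer simply falls back to the lowercased form:

import spacy

nlp = spacy.blank("ga")
# mode="pos_lookup" is already the default set in default_config above
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pos_lookup"})
# called with no arguments, initialize() loads the required "ga" tables
# from spacy-lookups-data
lemmatizer.initialize()
doc = nlp("Tá an aimsir go maith")
print([tok.lemma_ for tok in doc])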

spacy/lang/ga/irish_morphology_helpers.py (deleted)

@@ -1,35 +0,0 @@
# fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
slender_vowels = ["e", "é", "i", "í"]
vowels = broad_vowels + slender_vowels
# fmt: on
def ends_dentals(word):
    if word != "" and word[-1] in ["d", "n", "t", "s"]:
        return True
    else:
        return False


def devoice(word):
    if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
        return word[:-1] + "t"
    else:
        return word


def ends_with_vowel(word):
    return word != "" and word[-1] in vowels


def starts_with_vowel(word):
    return word != "" and word[0] in vowels


def deduplicate(word):
    if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
        return word[:-1]
    else:
        return word

spacy/lang/ga/lemmatizer.py (new file, 162 lines)

@@ -0,0 +1,162 @@
from typing import List, Dict, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token


class IrishLemmatizer(Lemmatizer):
    # This is a lookup-based lemmatiser using data extracted from
    # BuNaMo (https://github.com/michmech/BuNaMo)

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "pos_lookup":
            # fmt: off
            required = [
                "lemma_lookup_adj", "lemma_lookup_adp",
                "lemma_lookup_noun", "lemma_lookup_verb"
            ]
            # fmt: on
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        univ_pos = token.pos_
        string = unponc(token.text)
        if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
            return [string.lower()]
        demutated = demutate(string)
        secondary = ""
        if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
            secondary = string[1:]
        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
        if token.has_morph():
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            if univ_pos == "NOUN" and (
                "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
            ):
                hpref = "Form=HPref" in token.morph
                return [demutate(string, hpref).lower()]
            elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
                return [demutate(string).lower()]
        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

        def to_list(value):
            if value is None:
                value = []
            elif not isinstance(value, list):
                value = [value]
            return value

        if univ_pos == "ADP":
            return to_list(lookup_table.get(string, string.lower()))
        ret = []
        if univ_pos == "PROPN":
            ret.extend(to_list(lookup_table.get(demutated)))
            ret.extend(to_list(lookup_table.get(secondary)))
        else:
            ret.extend(to_list(lookup_table.get(demutated.lower())))
            ret.extend(to_list(lookup_table.get(secondary.lower())))
        if len(ret) == 0:
            ret = [string.lower()]
        return ret
def demutate(word: str, is_hpref: bool = False) -> str:
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    lc = word.lower()
    # remove eclipsis
    if lc.startswith("bhf"):
        word = word[2:]
    elif lc.startswith("mb"):
        word = word[1:]
    elif lc.startswith("gc"):
        word = word[1:]
    elif lc.startswith("nd"):
        word = word[1:]
    elif lc.startswith("ng"):
        word = word[1:]
    elif lc.startswith("bp"):
        word = word[1:]
    elif lc.startswith("dt"):
        word = word[1:]
    elif word[0:1] == "n" and word[1:2] in UVOWELS:
        word = word[1:]
    elif lc.startswith("n-") and word[2:3] in LVOWELS:
        word = word[2:]
    # non-standard eclipsis
    elif lc.startswith("bh-f"):
        word = word[3:]
    elif lc.startswith("m-b"):
        word = word[2:]
    elif lc.startswith("g-c"):
        word = word[2:]
    elif lc.startswith("n-d"):
        word = word[2:]
    elif lc.startswith("n-g"):
        word = word[2:]
    elif lc.startswith("b-p"):
        word = word[2:]
    elif lc.startswith("d-t"):
        word = word[2:]
    # t-prothesis
    elif lc.startswith("ts"):
        word = word[1:]
    elif lc.startswith("t-s"):
        word = word[2:]
    # h-prothesis, if known to be present
    elif is_hpref and word[0:1] == "h":
        word = word[1:]
    # h-prothesis, simple case
    # words can also begin with 'h', but unlike eclipsis,
    # a hyphen is not used, so that needs to be handled
    # elsewhere
    elif word[0:1] == "h" and word[1:2] in UVOWELS:
        word = word[1:]
    # lenition
    # this breaks the previous if, to handle super-non-standard
    # text where both eclipsis and lenition were used.
    if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
        word = word[0:1] + word[2:]
    return word


def unponc(word: str) -> str:
    # fmt: off
    PONC = {
        "ḃ": "bh",
        "ċ": "ch",
        "ḋ": "dh",
        "ḟ": "fh",
        "ġ": "gh",
        "ṁ": "mh",
        "ṗ": "ph",
        "ṡ": "sh",
        "ṫ": "th",
        "Ḃ": "BH",
        "Ċ": "CH",
        "Ḋ": "DH",
        "Ḟ": "FH",
        "Ġ": "GH",
        "Ṁ": "MH",
        "Ṗ": "PH",
        "Ṡ": "SH",
        "Ṫ": "TH"
    }
    # fmt: on
    buf = []
    for ch in word:
        if ch in PONC:
            buf.append(PONC[ch])
        else:
            buf.append(ch)
    return "".join(buf)

spacy/lang/ga/tokenizer_exceptions.py

@@ -9,6 +9,8 @@ _exc = {
    "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
    "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
    "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
    "théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
    "tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
}
for exc_data in [
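The two new entries cover colloquial contractions of "tar éis" ("after"). A brief sketch of the effect, assuming a blank Irish pipeline; the phrase is illustrative:

import spacy

nlp = spacy.blank("ga")
doc = nlp("tréis an dinnéir")
# "tréis" is split into "tr" + "éis", with norms "tar" and "éis"
print([(tok.text, tok.norm_) for tok in doc])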