spaCy/spacy/lang/ga/lemmatizer.py
Jim O’Regan 8fe525beb5
Add an Irish lemmatiser, based on BuNaMo (#9102)
* add tréis/théis

* remove previous contents, add demutate/unponc

* fmt off/on wrapping

* type hints

* IrishLemmatizer (sic)

* Use spacy-lookups-data>=1.0.3

* Minor bug fixes, refactoring for IrishLemmatizer

* Fix return type for ADP list lookups
* Fix and refactor lookup table lookups for missing/string/list
* Remove unused variables

* skip lookup of verbal substantives and adjectives; just demutate

* Fix morph checks API details

* Add types and format

* Move helper methods into lemmatizer

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2021-09-30 14:18:47 +02:00

163 lines
4.8 KiB
Python

from typing import List, Dict, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class IrishLemmatizer(Lemmatizer):
    # Lookup-driven lemmatiser for Irish, using tables extracted from
    # BuNaMo (https://github.com/michmech/BuNaMo)
    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return (required, optional) lookup-table names for `mode`."""
        if mode != "pos_lookup":
            return super().get_lookups_config(mode)
        # fmt: off
        required = [
            "lemma_lookup_adj", "lemma_lookup_adp",
            "lemma_lookup_noun", "lemma_lookup_verb"
        ]
        # fmt: on
        return (required, [])

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize `token` by POS-specific table lookup on its
        demutated (and de-ponced) form; falls back to the lowercased text."""
        univ_pos = token.pos_
        form = unponc(token.text)
        if univ_pos not in ("PROPN", "ADP", "ADJ", "NOUN", "VERB"):
            return [form.lower()]
        demutated = demutate(form)
        # Alternative candidate: bare h-prefix before a vowel stripped.
        secondary = ""
        if form[0:1].lower() == "h" and form[1:2].lower() in "aáeéiíoóuú":
            secondary = form[1:]
        lookup_pos = "noun" if univ_pos == "PROPN" else univ_pos.lower()
        if token.has_morph():
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            is_vnoun = (
                "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
            )
            if univ_pos == "NOUN" and is_vnoun:
                # Verbal noun: skip the lookup, just demutate.
                hpref = "Form=HPref" in token.morph
                return [demutate(form, hpref).lower()]
            if univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
                # Verbal adjective (participle): likewise, demutate only.
                return [demutate(form).lower()]
        table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

        def as_list(value):
            # Table values may be missing, a string, or a list of strings.
            if isinstance(value, list):
                return value
            return [] if value is None else [value]

        if univ_pos == "ADP":
            return as_list(table.get(form, form.lower()))
        if univ_pos == "PROPN":
            # Proper nouns keep their casing for the lookup.
            keys = (demutated, secondary)
        else:
            keys = (demutated.lower(), secondary.lower())
        results: List[str] = []
        for key in keys:
            results.extend(as_list(table.get(key)))
        return results if results else [form.lower()]
def demutate(word: str, is_hpref: bool = False) -> str:
    """Strip Irish initial mutations (eclipsis, t-/h-prothesis, lenition)
    from `word`, preserving the casing of the remaining letters.

    is_hpref: True when morphology indicates a prothetic leading 'h'
    (Form=HPref), so it is stripped even where the simple heuristic
    (h before an uppercase vowel) would not apply.
    """
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    lc = word.lower()
    # remove eclipsis
    if lc.startswith("bhf"):
        word = word[2:]
    elif lc.startswith(("mb", "gc", "nd", "ng", "bp", "dt")):
        word = word[1:]
    elif word[0:1] == "n" and word[1:2] in UVOWELS:
        word = word[1:]
    elif lc.startswith("n-") and word[2:3] in LVOWELS:
        word = word[2:]
    # non-standard (hyphenated) eclipsis
    elif lc.startswith("bh-f"):
        word = word[3:]
    elif lc.startswith(("m-b", "g-c", "n-d", "n-g", "b-p", "d-t")):
        word = word[2:]
    # t-prothesis
    elif lc.startswith("ts"):
        word = word[1:]
    elif lc.startswith("t-s"):
        word = word[2:]
    # h-prothesis, if known to be present
    elif is_hpref and word[0:1] == "h":
        word = word[1:]
    # h-prothesis, simple case
    # words can also begin with 'h', but unlike eclipsis,
    # a hyphen is not used, so that needs to be handled
    # elsewhere
    elif word[0:1] == "h" and word[1:2] in UVOWELS:
        word = word[1:]
    # lenition
    # this deliberately sits outside the if/elif chain, to handle
    # super-non-standard text where both eclipsis and lenition were used.
    # BUG FIX: recompute the lowercase form after any strip above; the stale
    # `lc` of the original word made e.g. "bhfear" (already stripped to
    # "fear") match its old "bh" prefix and get mangled to "far", while the
    # doubly-mutated case this check exists for (e.g. "gcheart") never fired.
    lc = word.lower()
    if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
        word = word[0:1] + word[2:]
    return word
def unponc(word: str) -> str:
    """Normalise dotted (ponc séimhithe) consonants to letter + 'h'.

    Older Irish orthography marks lenition with a dot above the consonant
    (e.g. ḃ for bh); rewrite each such character to the modern two-letter
    form, keeping case. All other characters pass through unchanged.

    BUG FIX: the mapping's dotted-letter keys had been destroyed by
    mis-encoding (most collapsed to the empty string, producing duplicate
    "" dict keys); the letters are restored here as explicit escapes.
    """
    # fmt: off
    PONC = {
        "\u1e03": "bh", "\u010b": "ch", "\u1e0b": "dh",   # ḃ ċ ḋ
        "\u1e1f": "fh", "\u0121": "gh", "\u1e41": "mh",   # ḟ ġ ṁ
        "\u1e57": "ph", "\u1e61": "sh", "\u1e6b": "th",   # ṗ ṡ ṫ
        "\u1e02": "BH", "\u010a": "CH", "\u1e0a": "DH",   # Ḃ Ċ Ḋ
        "\u1e1e": "FH", "\u0120": "GH", "\u1e40": "MH",   # Ḟ Ġ Ṁ
        "\u1e56": "PH", "\u1e60": "SH", "\u1e6a": "TH",   # Ṗ Ṡ Ṫ
    }
    # fmt: on
    # str.translate does the per-character replacement in one C-level pass.
    return word.translate(str.maketrans(PONC))