Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-04 01:48:04 +03:00

Commit message:
* add tréis/théis
* remove previous contents, add demutate/unponc
* fmt off/on wrapping
* type hints
* IrishLemmatizer (sic)
* Use spacy-lookups-data>=1.0.3
* Minor bug fixes, refactoring for IrishLemmatizer
* Fix return type for ADP list lookups
* Fix and refactor lookup table lookups for missing/string/list
* Remove unused variables
* skip lookup of verbal substantives and adjectives; just demutate
* Fix morph checks API details
* Add types and format
* Move helper methods into lemmatizer

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
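For context, a minimal sketch of how the Irish lemmatizer defined below is typically wired into a pipeline (assuming spaCy v3 and the spacy-lookups-data package, >=1.0.3 per the commit above, are installed; the pipeline setup itself is not part of this file):

    import spacy

    nlp = spacy.blank("ga")
    # "pos_lookup" mode relies on coarse POS tags, so a tagger or a trained
    # Irish pipeline would normally run before the lemmatizer.
    nlp.add_pipe("lemmatizer", config={"mode": "pos_lookup"})
    nlp.initialize()  # loads the lemma_lookup_* tables from spacy-lookups-data

The file itself follows.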
from typing import List, Dict, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token


class IrishLemmatizer(Lemmatizer):
    # This is a lookup-based lemmatiser using data extracted from
    # BuNaMo (https://github.com/michmech/BuNaMo)

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "pos_lookup":
            # fmt: off
            required = [
                "lemma_lookup_adj", "lemma_lookup_adp",
                "lemma_lookup_noun", "lemma_lookup_verb"
            ]
            # fmt: on
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        univ_pos = token.pos_
        string = unponc(token.text)
        if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
            return [string.lower()]
        demutated = demutate(string)
        # fallback lookup form with a possible prothetic h- removed
        secondary = ""
        if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
            secondary = string[1:]
        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
        if token.has_morph():
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            if univ_pos == "NOUN" and (
                "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
            ):
                hpref = "Form=HPref" in token.morph
                return [demutate(string, hpref).lower()]
            elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
                return [demutate(string).lower()]
        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

        def to_list(value):
            # table values may be missing, a single string or a list;
            # normalise everything to a list
            if value is None:
                value = []
            elif not isinstance(value, list):
                value = [value]
            return value

        if univ_pos == "ADP":
            return to_list(lookup_table.get(string, string.lower()))
        ret = []
        if univ_pos == "PROPN":
            ret.extend(to_list(lookup_table.get(demutated)))
            ret.extend(to_list(lookup_table.get(secondary)))
        else:
            ret.extend(to_list(lookup_table.get(demutated.lower())))
            ret.extend(to_list(lookup_table.get(secondary.lower())))
        if len(ret) == 0:
            ret = [string.lower()]
        return ret


def demutate(word: str, is_hpref: bool = False) -> str:
    # Strip an initial mutation (eclipsis, t/h-prothesis) and lenition.
    UVOWELS = "AÁEÉIÍOÓUÚ"
    LVOWELS = "aáeéiíoóuú"
    lc = word.lower()
    # remove eclipsis
    if lc.startswith("bhf"):
        word = word[2:]
    elif lc.startswith("mb"):
        word = word[1:]
    elif lc.startswith("gc"):
        word = word[1:]
    elif lc.startswith("nd"):
        word = word[1:]
    elif lc.startswith("ng"):
        word = word[1:]
    elif lc.startswith("bp"):
        word = word[1:]
    elif lc.startswith("dt"):
        word = word[1:]
    elif word[0:1] == "n" and word[1:2] in UVOWELS:
        word = word[1:]
    elif lc.startswith("n-") and word[2:3] in LVOWELS:
        word = word[2:]
    # non-standard eclipsis
    elif lc.startswith("bh-f"):
        word = word[3:]
    elif lc.startswith("m-b"):
        word = word[2:]
    elif lc.startswith("g-c"):
        word = word[2:]
    elif lc.startswith("n-d"):
        word = word[2:]
    elif lc.startswith("n-g"):
        word = word[2:]
    elif lc.startswith("b-p"):
        word = word[2:]
    elif lc.startswith("d-t"):
        word = word[2:]

    # t-prothesis
    elif lc.startswith("ts"):
        word = word[1:]
    elif lc.startswith("t-s"):
        word = word[2:]

    # h-prothesis, if known to be present
    elif is_hpref and word[0:1] == "h":
        word = word[1:]
    # h-prothesis, simple case
    # words can also begin with 'h', but unlike eclipsis,
    # a hyphen is not used, so that needs to be handled
    # elsewhere
    elif word[0:1] == "h" and word[1:2] in UVOWELS:
        word = word[1:]

    # lenition
    # this is a separate check (not an elif), to handle super-non-standard
    # text where both eclipsis and lenition were used.
    # check the current form rather than the original lowercase word, so
    # that already de-eclipsed forms (e.g. "bhf-") are not stripped further.
    if word[0:1].lower() in "bcdfgmpst" and word[1:2].lower() == "h":
        word = word[0:1] + word[2:]

    return word


def unponc(word: str) -> str:
    # Expand consonants written with an overdot (ponc séimhithe), as in
    # older Gaelic orthography, to the modern "<consonant>h" digraphs.
    # fmt: off
    PONC = {
        "ḃ": "bh",
        "ċ": "ch",
        "ḋ": "dh",
        "ḟ": "fh",
        "ġ": "gh",
        "ṁ": "mh",
        "ṗ": "ph",
        "ṡ": "sh",
        "ṫ": "th",
        "Ḃ": "BH",
        "Ċ": "CH",
        "Ḋ": "DH",
        "Ḟ": "FH",
        "Ġ": "GH",
        "Ṁ": "MH",
        "Ṗ": "PH",
        "Ṡ": "SH",
        "Ṫ": "TH"
    }
    # fmt: on
    buf = []
    for ch in word:
        if ch in PONC:
            buf.append(PONC[ch])
        else:
            buf.append(ch)
    return "".join(buf)
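As a quick check of the two helpers above, a minimal sketch (assuming this module is importable as spacy.lang.ga.lemmatizer; the example words are illustrative):

    from spacy.lang.ga.lemmatizer import demutate, unponc

    assert demutate("gcat") == "cat"      # eclipsis: "gc" -> "c"
    assert demutate("bhean") == "bean"    # lenition: "bh" -> "b"
    assert demutate("tsráid") == "sráid"  # t-prothesis removed
    assert unponc("ḃean") == "bhean"      # overdot expanded to "bh"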