Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
	Add an Irish lemmatiser, based on BuNaMo (#9102)
* add tréis/théis
* remove previous contents, add demutate/unponc
* fmt off/on wrapping
* type hints
* IrishLemmatizer (sic)
* Use spacy-lookups-data>=1.0.3
* Minor bug fixes, refactoring for IrishLemmatizer
* Fix return type for ADP list lookups
* Fix and refactor lookup table lookups for missing/string/list
* Remove unused variables
* skip lookup of verbal substantives and adjectives; just demutate
* Fix morph checks API details
* Add types and format
* Move helper methods into lemmatizer

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
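For orientation, a minimal usage sketch (not part of the diff; it assumes a spaCy build containing this commit plus spacy-lookups-data>=1.0.3): a blank Irish pipeline can now add the "lemmatizer" component registered below, which defaults to the new "pos_lookup" mode.

import spacy

# Build a blank Irish pipeline and add the newly registered lemmatizer.
nlp = spacy.blank("ga")
nlp.add_pipe("lemmatizer")  # mode defaults to "pos_lookup"
nlp.initialize()  # loads the lemma_lookup_* tables from spacy-lookups-data
doc = nlp("Dia dhuit")
# Without a tagger, token.pos_ is unset, so pos_lookup falls back to
# lowercasing (see pos_lookup_lemmatize in the new file below).
print([tok.lemma_ for tok in doc])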
This commit is contained in:

parent 5b0b0ca809
commit 8fe525beb5
setup.cfg

@@ -69,7 +69,7 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.2,<1.1.0
+    spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
     spacy_transformers>=1.0.1,<1.1.0
 ray =
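The floor bump above matters because the Irish lemma tables are expected to ship starting with spacy-lookups-data 1.0.3. A hedged sanity check that the four tables required by "pos_lookup" mode resolve (table names taken from get_lookups_config further down):

from spacy.lookups import load_lookups

# Raises an error if any table is missing from the installed data package.
tables = [
    "lemma_lookup_adj", "lemma_lookup_adp",
    "lemma_lookup_noun", "lemma_lookup_verb",
]
lookups = load_lookups("ga", tables)
print(sorted(lookups.tables))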
spacy/lang/ga/__init__.py

@@ -1,6 +1,11 @@
+from typing import Optional
+
+from thinc.api import Model
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from ...language import Language
+from .lemmatizer import IrishLemmatizer
 
 
 class IrishDefaults(Language.Defaults):
@@ -13,4 +18,16 @@ class Irish(Language):
     Defaults = IrishDefaults
 
 
+@Irish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Irish"]
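Since the default mode is "pos_lookup", the component reads token.pos_ and token.morph. A small sketch of the verbal-noun branch handled in the new lemmatizer (attributes are set by hand here because no tagger is attached; the example phrase is mine, not from the commit):

import spacy

nlp = spacy.blank("ga")
lemmatizer = nlp.add_pipe("lemmatizer")
nlp.initialize()
doc = nlp("ag ceannach")  # "buying": "ceannach" is a verbal noun
doc[1].pos_ = "NOUN"
doc[1].set_morph("VerbForm=Vnoun")
# Verbal nouns bypass the lookup tables and are only demutated:
print(lemmatizer.pos_lookup_lemmatize(doc[1]))  # ["ceannach"]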
(deleted file)

@@ -1,35 +0,0 @@
-# fmt: off
-consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
-broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
-slender_vowels = ["e", "é", "i", "í"]
-vowels = broad_vowels + slender_vowels
-# fmt: on
-
-
-def ends_dentals(word):
-    if word != "" and word[-1] in ["d", "n", "t", "s"]:
-        return True
-    else:
-        return False
-
-
-def devoice(word):
-    if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
-        return word[:-1] + "t"
-    else:
-        return word
-
-
-def ends_with_vowel(word):
-    return word != "" and word[-1] in vowels
-
-
-def starts_with_vowel(word):
-    return word != "" and word[0] in vowels
-
-
-def deduplicate(word):
-    if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
-        return word[:-1]
-    else:
-        return word

spacy/lang/ga/lemmatizer.py (new file, 162 lines)

@@ -0,0 +1,162 @@
+from typing import List, Dict, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class IrishLemmatizer(Lemmatizer):
+    # This is a lookup-based lemmatiser using data extracted from
+    # BuNaMo (https://github.com/michmech/BuNaMo)
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        if mode == "pos_lookup":
+            # fmt: off
+            required = [
+                "lemma_lookup_adj", "lemma_lookup_adp",
+                "lemma_lookup_noun", "lemma_lookup_verb"
+            ]
+            # fmt: on
+            return (required, [])
+        else:
+            return super().get_lookups_config(mode)
+
+    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
+        univ_pos = token.pos_
+        string = unponc(token.text)
+        if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
+            return [string.lower()]
+        demutated = demutate(string)
+        secondary = ""
+        if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
+            secondary = string[1:]
+        lookup_pos = univ_pos.lower()
+        if univ_pos == "PROPN":
+            lookup_pos = "noun"
+        if token.has_morph():
+            # TODO: lookup is actually required for the genitive forms, but
+            # this is not in BuNaMo, and would not be of use with IDT.
+            if univ_pos == "NOUN" and (
+                "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
+            ):
+                hpref = "Form=HPref" in token.morph
+                return [demutate(string, hpref).lower()]
+            elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
+                return [demutate(string).lower()]
+        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
+
+        def to_list(value):
+            if value is None:
+                value = []
+            elif not isinstance(value, list):
+                value = [value]
+            return value
+
+        if univ_pos == "ADP":
+            return to_list(lookup_table.get(string, string.lower()))
+        ret = []
+        if univ_pos == "PROPN":
+            ret.extend(to_list(lookup_table.get(demutated)))
+            ret.extend(to_list(lookup_table.get(secondary)))
+        else:
+            ret.extend(to_list(lookup_table.get(demutated.lower())))
+            ret.extend(to_list(lookup_table.get(secondary.lower())))
+        if len(ret) == 0:
+            ret = [string.lower()]
+        return ret
+
+
+def demutate(word: str, is_hpref: bool = False) -> str:
+    UVOWELS = "AÁEÉIÍOÓUÚ"
+    LVOWELS = "aáeéiíoóuú"
+    lc = word.lower()
+    # remove eclipsis
+    if lc.startswith("bhf"):
+        word = word[2:]
+    elif lc.startswith("mb"):
+        word = word[1:]
+    elif lc.startswith("gc"):
+        word = word[1:]
+    elif lc.startswith("nd"):
+        word = word[1:]
+    elif lc.startswith("ng"):
+        word = word[1:]
+    elif lc.startswith("bp"):
+        word = word[1:]
+    elif lc.startswith("dt"):
+        word = word[1:]
+    elif word[0:1] == "n" and word[1:2] in UVOWELS:
+        word = word[1:]
+    elif lc.startswith("n-") and word[2:3] in LVOWELS:
+        word = word[2:]
+    # non-standard eclipsis
+    elif lc.startswith("bh-f"):
+        word = word[3:]
+    elif lc.startswith("m-b"):
+        word = word[2:]
+    elif lc.startswith("g-c"):
+        word = word[2:]
+    elif lc.startswith("n-d"):
+        word = word[2:]
+    elif lc.startswith("n-g"):
+        word = word[2:]
+    elif lc.startswith("b-p"):
+        word = word[2:]
+    elif lc.startswith("d-t"):
+        word = word[2:]
+
+    # t-prothesis
+    elif lc.startswith("ts"):
+        word = word[1:]
+    elif lc.startswith("t-s"):
+        word = word[2:]
+
+    # h-prothesis, if known to be present
+    elif is_hpref and word[0:1] == "h":
+        word = word[1:]
+    # h-prothesis, simple case
+    # words can also begin with 'h', but unlike eclipsis,
+    # a hyphen is not used, so that needs to be handled
+    # elsewhere
+    elif word[0:1] == "h" and word[1:2] in UVOWELS:
+        word = word[1:]
+
+    # lenition
+    # this breaks the previous if, to handle super-non-standard
+    # text where both eclipsis and lenition were used.
+    if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
+        word = word[0:1] + word[2:]
+
+    return word
+
+
+def unponc(word: str) -> str:
+    # fmt: off
+    PONC = {
+        "ḃ": "bh",
+        "ċ": "ch",
+        "ḋ": "dh",
+        "ḟ": "fh",
+        "ġ": "gh",
+        "ṁ": "mh",
+        "ṗ": "ph",
+        "ṡ": "sh",
+        "ṫ": "th",
+        "Ḃ": "BH",
+        "Ċ": "CH",
+        "Ḋ": "DH",
+        "Ḟ": "FH",
+        "Ġ": "GH",
+        "Ṁ": "MH",
+        "Ṗ": "PH",
+        "Ṡ": "SH",
+        "Ṫ": "TH"
+    }
+    # fmt: on
+    buf = []
+    for ch in word:
+        if ch in PONC:
+            buf.append(PONC[ch])
+        else:
+            buf.append(ch)
+    return "".join(buf)
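demutate and unponc are plain module-level functions, so they can be exercised directly; the expected outputs below follow from the rules in the code above (the sample words are illustrative choices, not from the commit):

from spacy.lang.ga.lemmatizer import demutate, unponc

print(demutate("bhfuil"))      # eclipsis: strips "bh" before "f" -> "fuil"
print(demutate("ghorm"))       # lenition: drops the "h" -> "gorm"
print(demutate("hard", True))  # h-prothesis with is_hpref=True -> "ard"
print(unponc("ḃean"))          # overdot consonant to digraph -> "bhean"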
spacy/lang/ga/tokenizer_exceptions.py

@@ -9,6 +9,8 @@ _exc = {
     "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
     "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
     "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
+    "théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
+    "tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
 }
 
 for exc_data in [
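A quick hedged check of the new entries (the sentence is an illustrative example): the contracted forms now split into two tokens whose NORM values recover the standard "tar éis" ("after"):

import spacy

nlp = spacy.blank("ga")
doc = nlp("tréis an chogaidh")
print([(t.text, t.norm_) for t in doc[:2]])
# expected: [('tr', 'tar'), ('éis', 'éis')]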