mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Add an Irish lemmatiser, based on BuNaMo (#9102)
* add tréis/théis
* remove previous contents, add demutate/unponc
* fmt off/on wrapping
* type hints
* IrishLemmatizer (sic)
* Use spacy-lookups-data>=1.0.3
* Minor bug fixes, refactoring for IrishLemmatizer
* Fix return type for ADP list lookups
* Fix and refactor lookup table lookups for missing/string/list
* Remove unused variables
* skip lookup of verbal substantives and adjectives; just demutate
* Fix morph checks API details
* Add types and format
* Move helper methods into lemmatizer

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
5b0b0ca809
commit
8fe525beb5
|
@ -69,7 +69,7 @@ console_scripts =
|
|||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
spacy_lookups_data>=1.0.2,<1.1.0
|
||||
spacy_lookups_data>=1.0.3,<1.1.0
|
||||
transformers =
|
||||
spacy_transformers>=1.0.1,<1.1.0
|
||||
ray =
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from .lemmatizer import IrishLemmatizer
|
||||
|
||||
|
||||
class IrishDefaults(Language.Defaults):
|
||||
|
@ -13,4 +18,16 @@ class Irish(Language):
|
|||
Defaults = IrishDefaults
|
||||
|
||||
|
||||
@Irish.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
):
    """Build the Irish "lemmatizer" pipeline component.

    nlp: The pipeline whose vocab the component shares.
    model: Optional model, handed through to the lemmatizer unchanged.
    name: The component instance name.
    mode: The lemmatizer mode ("pos_lookup" in the default config).
    overwrite: Passed through to the lemmatizer as its `overwrite` setting.
    RETURNS: A configured IrishLemmatizer.
    """
    vocab = nlp.vocab
    lemmatizer = IrishLemmatizer(vocab, model, name, mode=mode, overwrite=overwrite)
    return lemmatizer
|
||||
|
||||
|
||||
__all__ = ["Irish"]
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
# Letter classes used by the helpers below. Each value is a list of
# single lower-case characters; "y" is deliberately absent from the
# consonants, exactly as in the original table.
consonants = list("bcdfghjklmnpqrstvwxz")
broad_vowels = list("aáoóuú")
slender_vowels = list("eéií")
# Broad vowels first, then slender ones.
vowels = [*broad_vowels, *slender_vowels]
|
||||
|
||||
|
||||
def ends_dentals(word):
    """Return True when `word` is non-empty and its last letter is d, n, t or s."""
    return word != "" and word[-1] in ("d", "n", "t", "s")
|
||||
|
||||
|
||||
def devoice(word):
    """Turn a final "sd" into "st" for words longer than two characters.

    Any other word is returned unchanged.
    """
    if len(word) <= 2 or not word.endswith("sd"):
        return word
    return word[:-1] + "t"
|
||||
|
||||
|
||||
def ends_with_vowel(word):
    """Return True when `word` is non-empty and its last character is in `vowels`."""
    if word == "":
        return False
    return word[-1] in vowels
|
||||
|
||||
|
||||
def starts_with_vowel(word):
    """Return True when `word` is non-empty and its first character is in `vowels`."""
    if word == "":
        return False
    return word[0] in vowels
|
||||
|
||||
|
||||
def deduplicate(word):
    """Drop the last letter of a word that ends in a doubled consonant.

    Only words longer than two characters are touched; everything else is
    returned unchanged.
    """
    if len(word) <= 2:
        return word
    penultimate, final = word[-2], word[-1]
    if penultimate == final and final in consonants:
        return word[:-1]
    return word
|
162
spacy/lang/ga/lemmatizer.py
Normal file
162
spacy/lang/ga/lemmatizer.py
Normal file
|
@ -0,0 +1,162 @@
|
|||
from typing import List, Dict, Tuple
|
||||
|
||||
from ...pipeline import Lemmatizer
|
||||
from ...tokens import Token
|
||||
|
||||
|
||||
class IrishLemmatizer(Lemmatizer):
    """Lookup-based lemmatiser for Irish.

    The lookup tables are extracted from BuNaMo
    (https://github.com/michmech/BuNaMo).
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return the (required, optional) lookup table names for `mode`.

        "pos_lookup" needs one table per looked-up part of speech and has no
        optional tables; every other mode is delegated to the parent class.
        """
        if mode != "pos_lookup":
            return super().get_lookups_config(mode)
        # fmt: off
        required_tables = [
            "lemma_lookup_adj", "lemma_lookup_adp",
            "lemma_lookup_noun", "lemma_lookup_verb"
        ]
        # fmt: on
        return (required_tables, [])

    def pos_lookup_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize `token` using its coarse POS tag and the lookup tables.

        token: The token to lemmatize.
        RETURNS: The candidate lemmas; falls back to the lower-cased
            (un-dotted) token text when nothing is found.
        """
        pos = token.pos_
        text = unponc(token.text)
        if pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
            return [text.lower()]

        demutated = demutate(text)
        # A leading "h" before a vowel may be h-prothesis or part of the
        # word itself, so the h-less form is kept as a second lookup key.
        h_stripped = ""
        if text[0:1].lower() == "h" and text[1:2].lower() in "aáeéiíoóuú":
            h_stripped = text[1:]

        # Proper nouns share the noun table.
        table_pos = "noun" if pos == "PROPN" else pos.lower()

        if token.has_morph():
            morph = token.morph
            # TODO: lookup is actually required for the genitive forms, but
            # this is not in BuNaMo, and would not be of use with IDT.
            if pos == "NOUN" and (
                "VerbForm=Vnoun" in morph or "VerbForm=Inf" in morph
            ):
                return [demutate(text, "Form=HPref" in morph).lower()]
            if pos == "ADJ" and "VerbForm=Part" in morph:
                return [demutate(text).lower()]

        table = self.lookups.get_table("lemma_lookup_" + table_pos, {})

        def as_list(entry):
            # Table entries may be missing, a single string, or a list.
            if entry is None:
                return []
            if isinstance(entry, list):
                return entry
            return [entry]

        if pos == "ADP":
            # Adpositions are looked up by their surface (un-dotted) form.
            return as_list(table.get(text, text.lower()))

        if pos == "PROPN":
            keys = (demutated, h_stripped)
        else:
            keys = (demutated.lower(), h_stripped.lower())

        lemmas = []
        for key in keys:
            lemmas.extend(as_list(table.get(key)))
        return lemmas if lemmas else [text.lower()]
|
||||
|
||||
|
||||
# Sets rather than strings: `"" in "AEIOU"` is True for a string, which
# would make a bare "n" or "h" look like a mutated vowel-initial word.
_UPPER_VOWELS = frozenset("AÁEÉIÍOÓUÚ")
_LOWER_VOWELS = frozenset("aáeéiíoóuú")
_LENITABLE = frozenset("bcdfgmpst")

# (lower-cased prefix, number of leading characters to drop).
# The prefixes are mutually exclusive, so the order is not significant.
_MUTATION_PREFIXES = (
    # standard eclipsis
    ("bhf", 2), ("mb", 1), ("gc", 1), ("nd", 1), ("ng", 1), ("bp", 1), ("dt", 1),
    # non-standard (hyphenated) eclipsis
    ("bh-f", 3), ("m-b", 2), ("g-c", 2), ("n-d", 2), ("n-g", 2), ("b-p", 2),
    ("d-t", 2),
    # t-prothesis
    ("ts", 1), ("t-s", 2),
)


def demutate(word: str, is_hpref: bool = False) -> str:
    """Strip Irish initial mutations from the start of `word`.

    Handles eclipsis (standard and hyphenated), t-prothesis, h-prothesis
    and lenition. The case of the remaining letters is preserved.

    word: The word form to demutate.
    is_hpref: True when the caller knows a leading "h" is h-prothesis
        (e.g. from morphological features); the "h" is then always removed.
    RETURNS: The word with its initial mutation removed, or the word
        unchanged when no mutation is recognised.
    """
    lc = word.lower()
    for prefix, drop in _MUTATION_PREFIXES:
        if lc.startswith(prefix):
            word = word[drop:]
            break
    else:
        # Eclipsis of a vowel: "nA..." or "n-a...".
        if word[0:1] == "n" and word[1:2] in _UPPER_VOWELS:
            word = word[1:]
        elif lc.startswith("n-") and word[2:3] in _LOWER_VOWELS:
            word = word[2:]
        # h-prothesis, if known to be present
        elif is_hpref and word[0:1] == "h":
            word = word[1:]
        # h-prothesis, simple case: words can also begin with 'h', but
        # unlike eclipsis a hyphen is not used, so the lower-case case
        # needs to be handled elsewhere.
        elif word[0:1] == "h" and word[1:2] in _UPPER_VOWELS:
            word = word[1:]

    # Lenition is checked separately from the chain above, to handle
    # super-non-standard text where both eclipsis and lenition were used.
    # It must look at the word as it is *now*: testing the original input
    # would see the "bh" of "bhf..." and delete a letter of the stem.
    lc = word.lower()
    if lc[0:1] in _LENITABLE and lc[1:2] == "h":
        word = word[0:1] + word[2:]

    return word
|
||||
|
||||
|
||||
def unponc(word: str) -> str:
    """Rewrite dotted (ponc) consonants as consonant + "h".

    Each character found in the table below is replaced by its two-letter
    spelling, keeping its case; every other character is copied through
    unchanged.

    word: The text to convert.
    RETURNS: The text with dotted consonants expanded.
    """
    # fmt: off
    dotted_to_digraph = {
        "ḃ": "bh", "ċ": "ch", "ḋ": "dh", "ḟ": "fh", "ġ": "gh",
        "ṁ": "mh", "ṗ": "ph", "ṡ": "sh", "ṫ": "th",
        "Ḃ": "BH", "Ċ": "CH", "Ḋ": "DH", "Ḟ": "FH", "Ġ": "GH",
        "Ṁ": "MH", "Ṗ": "PH", "Ṡ": "SH", "Ṫ": "TH",
    }
    # fmt: on
    return "".join(dotted_to_digraph.get(char, char) for char in word)
|
|
@ -9,6 +9,8 @@ _exc = {
|
|||
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
|
||||
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
|
||||
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
|
||||
"théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
|
||||
"tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
|
||||
}
|
||||
|
||||
for exc_data in [
|
||||
|
|
Loading…
Reference in New Issue
Block a user