This commit is contained in:
BLKSerene 2025-11-27 11:00:23 -08:00 committed by GitHub
commit e5eb6ec9c0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -89,80 +89,81 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
# Names of the top-level config sections, listed in a fixed order.
# NOTE(review): presumably used to order sections when a config is
# serialized — confirm against the callers of this constant.
CONFIG_SECTION_ORDER = [
    "paths",
    "variables",
    "system",
    "nlp",
    "components",
    "corpora",
    "training",
    "pretraining",
    "initialize",
]
# Maps each language code used as a key (two-letter codes; "xx" is paired with
# "mul", the ISO 639-2 code for multiple languages) to the set of alternative
# codes treated as aliases for it: ISO 639-2/T and 639-2/B three-letter codes
# plus a few obsolete or deprecated codes, as noted inline.
# Each key appears exactly once; a repeated key in a dict literal silently
# discards every value but the last.
LANG_ALIASES = {
    "af": {"afr"},
    "am": {"amh"},
    "ar": {"ara"},
    "az": {"aze"},
    "bg": {"bul"},
    "bn": {"ben"},
    "bo": {"bod", "tib"},
    "ca": {"cat"},
    "cs": {"ces", "cze"},
    "da": {"dan"},
    "de": {"deu", "ger"},
    "el": {"ell", "gre"},
    "en": {"eng"},
    "es": {"spa"},
    "et": {"est"},
    "eu": {"eus", "baq"},
    "fa": {"fas", "per"},
    "fi": {"fin"},
    "fo": {"fao"},
    "fr": {"fra", "fre"},
    "ga": {"gle"},
    "gd": {"gla"},
    "gu": {"guj"},
    "he": {"heb", "iw"},  # "iw" is the obsolete ISO 639-1 code for Hebrew
    "hi": {"hin"},
    "hr": {"hrv", "scr"},  # "scr" is the deprecated ISO 639-2/B code for Croatian
    "ht": {"hat"},
    "hu": {"hun"},
    "hy": {"hye"},
    "id": {"ind", "in"},  # "in" is the obsolete ISO 639-1 code for Indonesian
    "is": {"isl", "ice"},
    "it": {"ita"},
    "ja": {"jpn"},
    "kn": {"kan"},
    "ko": {"kor"},
    "ky": {"kir"},
    "la": {"lat"},
    "lb": {"ltz"},
    "lg": {"lug"},
    "lt": {"lit"},
    "lv": {"lav"},
    "mk": {"mkd", "mac"},
    "ml": {"mal"},
    "mr": {"mar"},
    "ms": {"msa", "may"},
    "nb": {"nob"},
    "ne": {"nep"},
    "nl": {"nld", "dut"},
    "nn": {"nno"},
    "pl": {"pol"},
    "pt": {"por"},
    # "rum" is the ISO 639-2/B code for Romanian; "mo" and "mol" are deprecated
    # codes for Moldavian.
    # NOTE(review): "rom" is the ISO 639-2 code for Romany, not Romanian; it is
    # kept only so existing lookups keep working — confirm whether to drop it.
    "ro": {"ron", "rum", "rom", "mo", "mol"},
    "ru": {"rus"},
    "sa": {"san"},
    "si": {"sin"},
    "sk": {"slk", "slo"},
    "sl": {"slv"},
    "sq": {"sqi", "alb"},
    "sr": {"srp", "scc"},  # "scc" is the deprecated ISO 639-2/B code for Serbian
    "sv": {"swe"},
    "ta": {"tam"},
    "te": {"tel"},
    "th": {"tha"},
    "ti": {"tir"},
    "tl": {"tgl"},
    "tn": {"tsn"},
    "tr": {"tur"},
    "tt": {"tat"},
    "uk": {"ukr"},
    "ur": {"urd"},
    # "vie" is the ISO 639-2/3 code for Vietnamese; "viw" is not an assigned
    # code and is kept only for backward compatibility.
    "vi": {"vie", "viw"},
    "yo": {"yor"},
    "zh": {"zho", "chi"},
    "xx": {"mul"},
}
# fmt: on