From 12854f85bd06b42d003f2246ed7c32f1a2c7f5a5 Mon Sep 17 00:00:00 2001
From: BLKSerene
Date: Thu, 6 Nov 2025 10:57:58 +0800
Subject: [PATCH] Add language alias for Haitian Creole

Register "hat" as an alias of "ht" in LANG_ALIASES and store every
alias collection as a set instead of a list.

While here, correct the Vietnamese alias ("viw" -> "vie") and the
comment on "in", which is the obsolete ISO 639-1 code for Indonesian,
not Hebrew.
---
 spacy/util.py | 147 +++++++++++++++++++++++++-------------------------
 1 file changed, 74 insertions(+), 73 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index ad5a7e0ba..72548e3ad 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -89,80 +89,81 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt"
 CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
 
 LANG_ALIASES = {
-    "af": ["afr"],
-    "am": ["amh"],
-    "ar": ["ara"],
-    "az": ["aze"],
-    "bg": ["bul"],
-    "bn": ["ben"],
-    "bo": ["bod", "tib"],
-    "ca": ["cat"],
-    "cs": ["ces", "cze"],
-    "da": ["dan"],
-    "de": ["deu", "ger"],
-    "el": ["ell", "gre"],
-    "en": ["eng"],
-    "es": ["spa"],
-    "et": ["est"],
-    "eu": ["eus", "baq"],
-    "fa": ["fas", "per"],
-    "fi": ["fin"],
-    "fo": ["fao"],
-    "fr": ["fra", "fre"],
-    "ga": ["gle"],
-    "gd": ["gla"],
-    "gu": ["guj"],
-    "he": ["heb", "iw"],  # "iw" is the obsolete ISO 639-1 code for Hebrew
-    "hi": ["hin"],
-    "hr": ["hrv", "scr"],  # "scr" is the deprecated ISO 639-2/B for Croatian
-    "hu": ["hun"],
-    "hy": ["hye"],
-    "id": ["ind", "in"],  # "in" is the obsolete ISO 639-1 code for Hebrew
-    "is": ["isl", "ice"],
-    "it": ["ita"],
-    "ja": ["jpn"],
-    "kn": ["kan"],
-    "ko": ["kor"],
-    "ky": ["kir"],
-    "la": ["lat"],
-    "lb": ["ltz"],
-    "lg": ["lug"],
-    "lt": ["lit"],
-    "lv": ["lav"],
-    "mk": ["mkd", "mac"],
-    "ml": ["mal"],
-    "mr": ["mar"],
-    "ms": ["msa", "may"],
-    "nb": ["nob"],
-    "ne": ["nep"],
-    "nl": ["nld", "dut"],
-    "nn": ["nno"],
-    "pl": ["pol"],
-    "pt": ["por"],
-    "ro": ["ron", "rom", "mo", "mol"],  # "mo" and "mol" are deprecated codes for Moldavian
-    "ru": ["rus"],
-    "sa": ["san"],
-    "si": ["sin"],
-    "sk": ["slk", "slo"],
-    "sl": ["slv"],
-    "sq": ["sqi", "alb"],
-    "sr": ["srp", "scc"],  # "scc" is the deprecated ISO 639-2/B code for Serbian
-    "sv": ["swe"],
-    "ta": ["tam"],
-    "te": ["tel"],
-    "th": ["tha"],
-    "ti": ["tir"],
-    "tl": ["tgl"],
-    "tn": ["tsn"],
-    "tr": ["tur"],
-    "tt": ["tat"],
-    "uk": ["ukr"],
-    "ur": ["urd"],
-    "vi": ["viw"],
-    "yo": ["yor"],
-    "zh": ["zho", "chi"],
+    "af": {"afr"},
+    "am": {"amh"},
+    "ar": {"ara"},
+    "az": {"aze"},
+    "bg": {"bul"},
+    "bn": {"ben"},
+    "bo": {"bod", "tib"},
+    "ca": {"cat"},
+    "cs": {"ces", "cze"},
+    "da": {"dan"},
+    "de": {"deu", "ger"},
+    "el": {"ell", "gre"},
+    "en": {"eng"},
+    "es": {"spa"},
+    "et": {"est"},
+    "eu": {"eus", "baq"},
+    "fa": {"fas", "per"},
+    "fi": {"fin"},
+    "fo": {"fao"},
+    "fr": {"fra", "fre"},
+    "ga": {"gle"},
+    "gd": {"gla"},
+    "gu": {"guj"},
+    "he": {"heb", "iw"},  # "iw" is the obsolete ISO 639-1 code for Hebrew
+    "hi": {"hin"},
+    "hr": {"hrv", "scr"},  # "scr" is the deprecated ISO 639-2/B for Croatian
+    "ht": {"hat"},
+    "hu": {"hun"},
+    "hy": {"hye"},
+    "id": {"ind", "in"},  # "in" is the obsolete ISO 639-1 code for Indonesian
+    "is": {"isl", "ice"},
+    "it": {"ita"},
+    "ja": {"jpn"},
+    "kn": {"kan"},
+    "ko": {"kor"},
+    "ky": {"kir"},
+    "la": {"lat"},
+    "lb": {"ltz"},
+    "lg": {"lug"},
+    "lt": {"lit"},
+    "lv": {"lav"},
+    "mk": {"mkd", "mac"},
+    "ml": {"mal"},
+    "mr": {"mar"},
+    "ms": {"msa", "may"},
+    "nb": {"nob"},
+    "ne": {"nep"},
+    "nl": {"nld", "dut"},
+    "nn": {"nno"},
+    "pl": {"pol"},
+    "pt": {"por"},
+    "ro": {"ron", "rom", "mo", "mol"},  # "mo" and "mol" are deprecated codes for Moldavian
+    "ru": {"rus"},
+    "sa": {"san"},
+    "si": {"sin"},
+    "sk": {"slk", "slo"},
+    "sl": {"slv"},
+    "sq": {"sqi", "alb"},
+    "sr": {"srp", "scc"},  # "scc" is the deprecated ISO 639-2/B code for Serbian
+    "sv": {"swe"},
+    "ta": {"tam"},
+    "te": {"tel"},
+    "th": {"tha"},
+    "ti": {"tir"},
+    "tl": {"tgl"},
+    "tn": {"tsn"},
+    "tr": {"tur"},
+    "tt": {"tat"},
+    "uk": {"ukr"},
+    "ur": {"urd"},
+    "vi": {"vie"},
+    "yo": {"yor"},
+    "zh": {"zho", "chi"},
 
-    "xx": ["mul"],
+    "xx": {"mul"},
 }
 # fmt: on
 