From c5abf918f4024883d4ea3908231db7a27bffa60b Mon Sep 17 00:00:00 2001
From: thomashacker <EdwardSchmuhl@web.de>
Date: Wed, 23 Nov 2022 13:53:56 +0100
Subject: [PATCH] add lang check function to cli._util

---
 spacy/cli/_util.py                  | 11 +++++++++++
 spacy/cli/convert.py                | 10 ++--------
 spacy/cli/init_config.py            | 13 ++++---------
 spacy/cli/init_pipeline.py          | 13 ++++---------
 spacy/errors.py                     |  5 +----
 spacy/tests/lang/test_initialize.py |  4 ++--
 6 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 897964a88..b4a19d6b2 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -20,6 +20,7 @@ from ..compat import Literal
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
+from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
 
 if TYPE_CHECKING:
@@ -132,6 +133,16 @@ def _parse_override(value: Any) -> Any:
         return str(value)
 
 
+def _handle_renamed_language_codes(lang: str) -> None:
+    # Reject language codes renamed in v4: report the new code via msg.fail and exit (exits=1)
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
+
 def load_project_config(
     path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
 ) -> Dict[str, Any]:
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 90e61cf4f..f7165cfa7 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -7,13 +7,12 @@ import re
 import sys
 import itertools
 
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, _handle_renamed_language_codes
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
 from ..training.converters import conllu_to_docs
 
-from ..errors import RENAMED_LANGUAGE_CODES
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
@@ -113,12 +112,7 @@ def convert(
         msg = Printer(no_print=silent)
 
     # Throw error for renamed language codes in v4
-    if lang in RENAMED_LANGUAGE_CODES:
-        msg.fail(
-            title="Renamed language code",
-            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
-            exits=1,
-        )
+    _handle_renamed_language_codes(lang)
 
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 84b29682b..f758902b7 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -12,9 +12,7 @@ from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
 from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
-
-from ..errors import RENAMED_LANGUAGE_CODES
+from ._util import string_to_list, import_code, _handle_renamed_language_codes
 
 
 ROOT = Path(__file__).parent / "templates"
@@ -160,13 +158,10 @@ def init_config(
     msg = Printer(no_print=silent)
     with TEMPLATE_PATH.open("r") as f:
         template = Template(f.read())
+
     # Throw error for renamed language codes in v4
-    if lang in RENAMED_LANGUAGE_CODES:
-        msg.fail(
-            title="Renamed language code",
-            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
-            exits=1,
-        )
+    _handle_renamed_language_codes(lang)
+
     # Filter out duplicates since tok2vec and transformer are added by template
     pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
     defaults = RECOMMENDATIONS["__default__"]
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index ddea0e5cf..f279cf793 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -9,9 +9,7 @@ from .. import util
 from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-
-from ..errors import RENAMED_LANGUAGE_CODES
+from ._util import import_code, setup_gpu, _handle_renamed_language_codes
 
 
 @init_cli.command("vectors")
@@ -33,13 +31,10 @@ def init_vectors_cli(
     a model with vectors.
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+
     # Throw error for renamed language codes in v4
-    if lang in RENAMED_LANGUAGE_CODES:
-        msg.fail(
-            title="Renamed language code",
-            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
-            exits=1,
-        )
+    _handle_renamed_language_codes(lang)
+
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
diff --git a/spacy/errors.py b/spacy/errors.py
index d4a7ac191..83aea983a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -965,10 +965,7 @@ OLD_MODEL_SHORTCUTS = {
 }
 
 # Renamed language codes in v4
-RENAMED_LANGUAGE_CODES = {
-        "xx":"mul", "is":"isl"
-}
-
+RENAMED_LANGUAGE_CODES = {"xx":"mul", "is":"isl"}
 
 # fmt: on
 
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index df76307f1..98d37f832 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -8,9 +8,9 @@ from spacy.util import get_lang_class
 LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
              "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
-             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "mul", "yo"]
+             "tr", "tt", "uk", "ur", "yo"]
 # fmt: on