From c5abf918f4024883d4ea3908231db7a27bffa60b Mon Sep 17 00:00:00 2001 From: thomashacker Date: Wed, 23 Nov 2022 13:53:56 +0100 Subject: [PATCH] add lang check function to cli._util --- spacy/cli/_util.py | 11 +++++++++++ spacy/cli/convert.py | 10 ++-------- spacy/cli/init_config.py | 13 ++++--------- spacy/cli/init_pipeline.py | 13 ++++--------- spacy/errors.py | 5 +---- spacy/tests/lang/test_initialize.py | 4 ++-- 6 files changed, 24 insertions(+), 32 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 897964a88..b4a19d6b2 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -20,6 +20,7 @@ from ..compat import Literal from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS +from ..errors import RENAMED_LANGUAGE_CODES from .. import about if TYPE_CHECKING: @@ -132,6 +133,16 @@ def _parse_override(value: Any) -> Any: return str(value) +def _handle_renamed_language_codes(lang: str) -> None: + # Throw error for renamed language codes in v4 + if lang in RENAMED_LANGUAGE_CODES: + msg.fail( + title="Renamed language code", + text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.", + exits=1, + ) + + def load_project_config( path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() ) -> Dict[str, Any]: diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 90e61cf4f..f7165cfa7 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,13 +7,12 @@ import re import sys import itertools -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, _handle_renamed_language_codes from ..training import docs_to_json from ..tokens import Doc, DocBin from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs from ..training.converters import conllu_to_docs -from ..errors import RENAMED_LANGUAGE_CODES # Converters are matched by file extension except for ner/iob, which are # matched by file extension and content. To add a converter, add a new @@ -113,12 +112,7 @@ def convert( msg = Printer(no_print=silent) # Throw error for renamed language codes in v4 - if lang in RENAMED_LANGUAGE_CODES: - msg.fail( - title="Renamed language code", - text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.", - exits=1, - ) + _handle_renamed_language_codes(lang) ner_map = srsly.read_json(ner_map) if ner_map is not None else None doc_files = [] diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 84b29682b..f758902b7 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -12,9 +12,7 @@ from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema from ..util import SimpleFrozenList from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND -from ._util import string_to_list, import_code - -from ..errors import RENAMED_LANGUAGE_CODES +from ._util import string_to_list, import_code, _handle_renamed_language_codes ROOT = Path(__file__).parent / "templates" @@ -160,13 +158,10 @@ def init_config( msg = Printer(no_print=silent) with TEMPLATE_PATH.open("r") as f: template = Template(f.read()) + # Throw error for renamed language codes in v4 - if lang in RENAMED_LANGUAGE_CODES: - msg.fail( - title="Renamed language code", - text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.", - exits=1, - ) + _handle_renamed_language_codes(lang) + # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] defaults = RECOMMENDATIONS["__default__"] diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index ddea0e5cf..f279cf793 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -9,9 +9,7 @@ from .. import util from ..training.initialize import init_nlp, convert_vectors from ..language import Language from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, setup_gpu - -from ..errors import RENAMED_LANGUAGE_CODES +from ._util import import_code, setup_gpu, _handle_renamed_language_codes @init_cli.command("vectors") @@ -33,13 +31,10 @@ def init_vectors_cli( a model with vectors. """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + # Throw error for renamed language codes in v4 - if lang in RENAMED_LANGUAGE_CODES: - msg.fail( - title="Renamed language code", - text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.", - exits=1, - ) + _handle_renamed_language_codes(lang) + msg.info(f"Creating blank nlp object for language '{lang}'") nlp = util.get_lang_class(lang)() if jsonl_loc is not None: diff --git a/spacy/errors.py b/spacy/errors.py index d4a7ac191..83aea983a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -965,10 +965,7 @@ OLD_MODEL_SHORTCUTS = { } # Renamed language codes in v4 -RENAMED_LANGUAGE_CODES = { - "xx":"mul", "is":"isl" -} - +RENAMED_LANGUAGE_CODES = {"xx":"mul", "is":"isl"} # fmt: on diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index df76307f1..98d37f832 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -8,9 +8,9 @@ from spacy.util import get_lang_class LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi", "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv", - "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", + "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", - "tr", "tt", "uk", "ur", "mul", "yo"] + "tr", "tt", "uk", "ur", "yo"] # fmt: on