add lang check function to cli._util

This commit is contained in:
thomashacker 2022-11-23 13:53:56 +01:00
parent 49fda655c3
commit c5abf918f4
6 changed files with 24 additions and 32 deletions

View File

@ -20,6 +20,7 @@ from ..compat import Literal
from ..schemas import ProjectConfigSchema, validate from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from ..errors import RENAMED_LANGUAGE_CODES
from .. import about from .. import about
if TYPE_CHECKING: if TYPE_CHECKING:
@ -132,6 +133,16 @@ def _parse_override(value: Any) -> Any:
return str(value) return str(value)
def _handle_renamed_language_codes(lang: str) -> None:
    """Report an error when `lang` is a language code that was renamed in v4.

    lang (str): The language code to check against RENAMED_LANGUAGE_CODES.

    Codes that are not keys of RENAMED_LANGUAGE_CODES are accepted silently.
    For a renamed code, the failure is reported through `msg.fail` with
    `exits=1`; the message names both the old code and its replacement.
    """
    if lang not in RENAMED_LANGUAGE_CODES:
        # Not a renamed code: nothing to report.
        return
    failure_text = f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'."
    # NOTE(review): exits=1 is presumably what terminates the CLI with a
    # non-zero status -- confirm against the wasabi Printer API.
    msg.fail(title="Renamed language code", text=failure_text, exits=1)
def load_project_config( def load_project_config(
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]: ) -> Dict[str, Any]:

View File

@ -7,13 +7,12 @@ import re
import sys import sys
import itertools import itertools
from ._util import app, Arg, Opt from ._util import app, Arg, Opt, _handle_renamed_language_codes
from ..training import docs_to_json from ..training import docs_to_json
from ..tokens import Doc, DocBin from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
from ..training.converters import conllu_to_docs from ..training.converters import conllu_to_docs
from ..errors import RENAMED_LANGUAGE_CODES
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new # matched by file extension and content. To add a converter, add a new
@ -113,12 +112,7 @@ def convert(
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
# Throw error for renamed language codes in v4 # Throw error for renamed language codes in v4
if lang in RENAMED_LANGUAGE_CODES: _handle_renamed_language_codes(lang)
msg.fail(
title="Renamed language code",
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
exits=1,
)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = [] doc_files = []

View File

@ -12,9 +12,7 @@ from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code from ._util import string_to_list, import_code, _handle_renamed_language_codes
from ..errors import RENAMED_LANGUAGE_CODES
ROOT = Path(__file__).parent / "templates" ROOT = Path(__file__).parent / "templates"
@ -160,13 +158,10 @@ def init_config(
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
with TEMPLATE_PATH.open("r") as f: with TEMPLATE_PATH.open("r") as f:
template = Template(f.read()) template = Template(f.read())
# Throw error for renamed language codes in v4 # Throw error for renamed language codes in v4
if lang in RENAMED_LANGUAGE_CODES: _handle_renamed_language_codes(lang)
msg.fail(
title="Renamed language code",
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
exits=1,
)
# Filter out duplicates since tok2vec and transformer are added by template # Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
defaults = RECOMMENDATIONS["__default__"] defaults = RECOMMENDATIONS["__default__"]

View File

@ -9,9 +9,7 @@ from .. import util
from ..training.initialize import init_nlp, convert_vectors from ..training.initialize import init_nlp, convert_vectors
from ..language import Language from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu from ._util import import_code, setup_gpu, _handle_renamed_language_codes
from ..errors import RENAMED_LANGUAGE_CODES
@init_cli.command("vectors") @init_cli.command("vectors")
@ -33,13 +31,10 @@ def init_vectors_cli(
a model with vectors. a model with vectors.
""" """
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
# Throw error for renamed language codes in v4 # Throw error for renamed language codes in v4
if lang in RENAMED_LANGUAGE_CODES: _handle_renamed_language_codes(lang)
msg.fail(
title="Renamed language code",
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
exits=1,
)
msg.info(f"Creating blank nlp object for language '{lang}'") msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)() nlp = util.get_lang_class(lang)()
if jsonl_loc is not None: if jsonl_loc is not None:

View File

@ -965,10 +965,7 @@ OLD_MODEL_SHORTCUTS = {
} }
# Renamed language codes in v4 # Renamed language codes in v4
RENAMED_LANGUAGE_CODES = { RENAMED_LANGUAGE_CODES = {"xx":"mul", "is":"isl"}
"xx":"mul", "is":"isl"
}
# fmt: on # fmt: on

View File

@ -8,9 +8,9 @@ from spacy.util import get_lang_class
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
"en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi", "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
"hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv", "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
"tr", "tt", "uk", "ur", "mul", "yo"] "tr", "tt", "uk", "ur", "yo"]
# fmt: on # fmt: on