mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 12:50:20 +03:00
add lang check function to cli._util
This commit is contained in:
parent
49fda655c3
commit
c5abf918f4
|
@ -20,6 +20,7 @@ from ..compat import Literal
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||||
|
from ..errors import RENAMED_LANGUAGE_CODES
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -132,6 +133,16 @@ def _parse_override(value: Any) -> Any:
|
||||||
return str(value)
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_renamed_language_codes(lang: str) -> None:
    """Fail with an explanatory message if a renamed language code is used.

    Language codes that appear as keys in RENAMED_LANGUAGE_CODES were replaced
    in spaCy v4. For such a code, the failure is reported via `msg.fail` with
    `exits=1`, naming the replacement code. Any other code produces no output.

    lang (str): The language code to check.
    """
    # Nothing to report for codes that were not renamed.
    if lang not in RENAMED_LANGUAGE_CODES:
        return
    # Throw error for renamed language codes in v4
    msg.fail(
        title="Renamed language code",
        text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
        exits=1,
    )
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(
|
def load_project_config(
|
||||||
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
|
|
|
@ -7,13 +7,12 @@ import re
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt, _handle_renamed_language_codes
|
||||||
from ..training import docs_to_json
|
from ..training import docs_to_json
|
||||||
from ..tokens import Doc, DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||||
from ..training.converters import conllu_to_docs
|
from ..training.converters import conllu_to_docs
|
||||||
|
|
||||||
from ..errors import RENAMED_LANGUAGE_CODES
|
|
||||||
|
|
||||||
# Converters are matched by file extension except for ner/iob, which are
|
# Converters are matched by file extension except for ner/iob, which are
|
||||||
# matched by file extension and content. To add a converter, add a new
|
# matched by file extension and content. To add a converter, add a new
|
||||||
|
@ -113,12 +112,7 @@ def convert(
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
|
|
||||||
# Throw error for renamed language codes in v4
|
# Throw error for renamed language codes in v4
|
||||||
if lang in RENAMED_LANGUAGE_CODES:
|
_handle_renamed_language_codes(lang)
|
||||||
msg.fail(
|
|
||||||
title="Renamed language code",
|
|
||||||
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||||
doc_files = []
|
doc_files = []
|
||||||
|
|
|
@ -12,9 +12,7 @@ from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
||||||
from ..schemas import RecommendationSchema
|
from ..schemas import RecommendationSchema
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList
|
||||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||||
from ._util import string_to_list, import_code
|
from ._util import string_to_list, import_code, _handle_renamed_language_codes
|
||||||
|
|
||||||
from ..errors import RENAMED_LANGUAGE_CODES
|
|
||||||
|
|
||||||
|
|
||||||
ROOT = Path(__file__).parent / "templates"
|
ROOT = Path(__file__).parent / "templates"
|
||||||
|
@ -160,13 +158,10 @@ def init_config(
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
with TEMPLATE_PATH.open("r") as f:
|
with TEMPLATE_PATH.open("r") as f:
|
||||||
template = Template(f.read())
|
template = Template(f.read())
|
||||||
|
|
||||||
# Throw error for renamed language codes in v4
|
# Throw error for renamed language codes in v4
|
||||||
if lang in RENAMED_LANGUAGE_CODES:
|
_handle_renamed_language_codes(lang)
|
||||||
msg.fail(
|
|
||||||
title="Renamed language code",
|
|
||||||
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
# Filter out duplicates since tok2vec and transformer are added by template
|
# Filter out duplicates since tok2vec and transformer are added by template
|
||||||
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
||||||
defaults = RECOMMENDATIONS["__default__"]
|
defaults = RECOMMENDATIONS["__default__"]
|
||||||
|
|
|
@ -9,9 +9,7 @@ from .. import util
|
||||||
from ..training.initialize import init_nlp, convert_vectors
|
from ..training.initialize import init_nlp, convert_vectors
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code, setup_gpu
|
from ._util import import_code, setup_gpu, _handle_renamed_language_codes
|
||||||
|
|
||||||
from ..errors import RENAMED_LANGUAGE_CODES
|
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("vectors")
|
@init_cli.command("vectors")
|
||||||
|
@ -33,13 +31,10 @@ def init_vectors_cli(
|
||||||
a model with vectors.
|
a model with vectors.
|
||||||
"""
|
"""
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
|
|
||||||
# Throw error for renamed language codes in v4
|
# Throw error for renamed language codes in v4
|
||||||
if lang in RENAMED_LANGUAGE_CODES:
|
_handle_renamed_language_codes(lang)
|
||||||
msg.fail(
|
|
||||||
title="Renamed language code",
|
|
||||||
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in v4. Please change your current defined language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||||
nlp = util.get_lang_class(lang)()
|
nlp = util.get_lang_class(lang)()
|
||||||
if jsonl_loc is not None:
|
if jsonl_loc is not None:
|
||||||
|
|
|
@ -965,10 +965,7 @@ OLD_MODEL_SHORTCUTS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
# Renamed language codes in v4
|
# Renamed language codes in v4
|
||||||
RENAMED_LANGUAGE_CODES = {
|
RENAMED_LANGUAGE_CODES = {"xx":"mul", "is":"isl"}
|
||||||
"xx":"mul", "is":"isl"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,9 @@ from spacy.util import get_lang_class
|
||||||
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
|
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
|
||||||
"en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
|
"en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
|
||||||
"hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
|
"hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
|
||||||
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
|
"mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
|
||||||
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
|
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
|
||||||
"tr", "tt", "uk", "ur", "mul", "yo"]
|
"tr", "tt", "uk", "ur", "yo"]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user