spaCy/spacy/cli/init_config.py

from typing import Optional, List, Tuple
from enum import Enum
from pathlib import Path
from wasabi import Printer, diff_strings
from thinc.api import Config
import srsly
import re
from jinja2 import Template

from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list


# Directory next to this module that holds the quickstart template files.
ROOT = Path(__file__).parent / "templates"
# Jinja template rendered by init_config() to produce the base config text.
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
# Recommendations are read from YAML once, at import time; init_config() looks
# them up by language code and falls back to an empty dict for unknown codes.
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")


class Optimizations(str, Enum):
    """Allowed values for the --optimize option of the init config command."""

    # Mixing in str means each member also behaves like its string value.
    efficiency = "efficiency"
    accuracy = "accuracy"


# CLI entry point registered as the "config" sub-command of init_cli. It only
# normalizes the raw CLI values and delegates the real work to init_config().
@init_cli.command("config")
def init_config_cli(
    # fmt: off
    output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
    lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
    # fmt: on
):
    """
    Generate a starter config.cfg for training. Based on your requirements
    specified via the CLI arguments, this command generates a config with the
    optimal settings for your use case. This includes the choice of architecture,
    pretrained weights and related hyperparameters.

    DOCS: https://nightly.spacy.io/api/cli#init-config
    """
    # init_config() is annotated to take the optimization target as a plain
    # str, so unwrap the enum member when one is passed in.
    if isinstance(optimize, Optimizations):  # instance of enum from the CLI
        optimize = optimize.value
    # init_config() expects a list of component names rather than the raw
    # --pipeline string (assumes string_to_list splits on commas; confirm in
    # ._util).
    pipeline = string_to_list(pipeline)
    init_config(
        output_file,
        lang=lang,
        pipeline=pipeline,
        optimize=optimize,
        cpu=cpu,
        pretraining=pretraining,
    )


# CLI entry point registered as the "fill-config" sub-command of init_cli. All
# arguments are forwarded unchanged to fill_config().
@init_cli.command("fill-config")
def init_fill_config_cli(
    # fmt: off
    base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
    # fmt: on
):
    """
    Fill partial config.cfg with default values. Will add all missing settings
    from the default config and will create all objects, check the registered
    functions for their default values and update the base config. This command
    can be used with a config generated via the training quickstart widget:
    https://nightly.spacy.io/usage/training#quickstart

    DOCS: https://nightly.spacy.io/api/cli#init-fill-config
    """
    # Note the argument order: fill_config() takes the output file first and
    # the base config second. The returned configs are not used by the CLI.
    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)


def fill_config(
    output_file: Path,
    base_path: Path,
    *,
    pretraining: bool = False,
    diff: bool = False,
    silent: bool = False,
) -> Tuple[Config, Config]:
    """Load a partial config, auto-fill it and save the completed result.

    output_file (Path): Where to save the filled config, or "-" to print it
        to stdout. Writing to stdout suppresses all other output.
    base_path (Path): Path of the base config to load and fill.
    pretraining (bool): Also merge in the default pretraining config.
    diff (bool): Print a diff between the base config and the filled config.
        Ignored when printing is disabled (stdout output or silent mode).
    silent (bool): Suppress status messages and the diff.
    RETURNS (Tuple[Config, Config]): The config as loaded from base_path and
        the filled config that was saved.
    """
    is_stdout = str(output_file) == "-"
    # When the config itself goes to stdout, nothing else may be printed.
    no_print = is_stdout or silent
    msg = Printer(no_print=no_print)
    with show_validation_error(hint_fill=False):
        config = util.load_config(base_path)
        # First pass: fill in missing values without validating.
        nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
        # Load a second time with validation to be extra sure that the produced
        # config result is a valid config. This stays inside the context
        # manager so a failure here is reported the same way as one raised
        # while loading the base config.
        nlp = util.load_model_from_config(nlp.config)
    filled = nlp.config
    if pretraining:
        validate_config_for_pretrain(filled, msg)
        pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
        # The filled config is merged on top of the pretraining defaults.
        filled = pretrain_config.merge(filled)
    # Compare serialized forms to find out whether filling changed anything.
    before = config.to_str()
    after = filled.to_str()
    if before == after:
        msg.warn("Nothing to auto-fill: base config is already complete")
    else:
        msg.good("Auto-filled config with all values")
    if diff and not no_print:
        if before == after:
            msg.warn("No diff to show: nothing was auto-filled")
        else:
            msg.divider("START CONFIG DIFF")
            print("")
            print(diff_strings(before, after))
            msg.divider("END CONFIG DIFF")
            print("")
    save_config(filled, output_file, is_stdout=is_stdout, silent=silent)
    return config, filled


def init_config(
    output_file: Path,
    *,
    lang: str,
    pipeline: List[str],
    optimize: str,
    cpu: bool,
    pretraining: bool = False,
) -> None:
    """Render the quickstart template into a filled config and save it.

    output_file (Path): Where to save the config, or "-" to print it to
        stdout. Writing to stdout suppresses all status messages.
    lang (str): Language code. Also the key used to look up recommendations.
    pipeline (List[str]): Names of the components to include. "tok2vec" and
        "transformer" entries are dropped before rendering.
    optimize (str): Optimization target, passed to the template as "optimize".
    cpu (bool): Render the template for "cpu" hardware instead of "gpu".
    pretraining (bool): Also merge in the default pretraining config.
    """
    is_stdout = str(output_file) == "-"
    msg = Printer(no_print=is_stdout)
    # Read the template with an explicit encoding so the result does not
    # depend on the platform's locale default.
    with TEMPLATE_PATH.open("r", encoding="utf8") as f:
        template = Template(f.read())
    # Filter out duplicates since tok2vec and transformer are added by template
    pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
    # Languages without an entry fall back to the schema's defaults.
    reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
    variables = {
        "lang": lang,
        "components": pipeline,
        "optimize": optimize,
        "hardware": "cpu" if cpu else "gpu",
        "transformer_data": reco["transformer"],
        "word_vectors": reco["word_vectors"],
        "has_letters": reco["has_letters"],
    }
    if variables["transformer_data"] and not has_spacy_transformers():
        msg.warn(
            "To generate a more effective transformer-based config (GPU-only), "
            "install the spacy-transformers package and re-run this command. "
            "The config generated now does not use transformers."
        )
        # Render without transformer settings when the package is unavailable.
        variables["transformer_data"] = None
    base_template = template.render(variables).strip()
    # Giving up on getting the newlines right in jinja for now
    base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
    # Access variables declared in templates
    template_vars = template.make_module(variables)
    use_case = {
        "Language": lang,
        "Pipeline": ", ".join(pipeline),
        "Optimize for": optimize,
        "Hardware": variables["hardware"].upper(),
        "Transformer": template_vars.transformer.get("name", False),
    }
    msg.info("Generated template specific for your use case")
    for label, value in use_case.items():
        msg.text(f"- {label}: {value}")
    with show_validation_error(hint_fill=False):
        config = util.load_config_from_str(base_template)
        # Fill in the settings the rendered template leaves out.
        nlp = util.load_model_from_config(config, auto_fill=True)
        config = nlp.config
        if pretraining:
            validate_config_for_pretrain(config, msg)
            pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
            # The generated config is merged on top of the pretraining defaults.
            config = pretrain_config.merge(config)
    msg.good("Auto-filled config with all values")
    save_config(config, output_file, is_stdout=is_stdout)


def save_config(
    config: Config, output_file: Path, is_stdout: bool = False, silent: bool = False
) -> None:
    """Print the config to stdout or write it to disk.

    config (Config): The config to output.
    output_file (Path): Destination file. Only used when is_stdout is False;
        missing parent directories are created.
    is_stdout (bool): Print the serialized config instead of writing a file.
        The config is printed even if silent is set.
    silent (bool): Suppress the status messages and the suggested train
        command after writing the file.
    """
    no_print = is_stdout or silent
    msg = Printer(no_print=no_print)
    if is_stdout:
        print(config.to_str())
    else:
        # exist_ok avoids the check-then-create race of testing for the
        # directory first and then calling mkdir.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        config.to_disk(output_file, interpolate=False)
        msg.good("Saved config", output_file)
        msg.text("You can now add your data and train your pipeline:")
        variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
        # Plain print() is not governed by the Printer, so it needs its own guard.
        if not no_print:
            print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")


def has_spacy_transformers() -> bool:
    """Report whether the spacy_transformers package can be imported.

    RETURNS (bool): True if the import succeeds, False on ImportError.
    """
    try:
        import spacy_transformers  # noqa: F401
    except ImportError:
        return False
    return True


def validate_config_for_pretrain(config: Config, msg: Printer) -> None:
    """Warn if the pipeline contains no component named "tok2vec".

    config (Config): The config whose [nlp] pipeline is checked.
    msg (Printer): Printer used to emit the warning.
    """
    pipe_names = config["nlp"]["pipeline"]
    if "tok2vec" in pipe_names:
        # Nothing to report: a component with the expected name is present.
        return
    msg.warn(
        "No tok2vec component found in the pipeline. If your tok2vec "
        "component has a different name, you may need to adjust the "
        "tok2vec_model reference in the [pretraining] block. If you don't "
        "have a tok2vec component, make sure to add it to your [components] "
        "and the pipeline specified in the [nlp] block, so you can pretrain "
        "weights for it."
    )
Add init fill-config 2020-08-14 17:49:26 +03:00			`from typing import Optional, List, Tuple`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`from enum import Enum`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 16:18:30 +03:00			`from pathlib import Path`
Add init fill-config 2020-08-14 17:49:26 +03:00			`from wasabi import Printer, diff_strings`
			`from thinc.api import Config`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`import srsly`
			`import re`
Make jinja2 top-level import No problem anymore since it's now an official dependency 2020-11-27 10:17:14 +03:00			`from jinja2 import Template`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 16:18:30 +03:00
Update Thinc and include section order 2020-08-14 15:06:22 +03:00			`from .. import util`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 16:56:03 +03:00			`from ..language import DEFAULT_CONFIG_PRETRAIN_PATH`
Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 14:33:15 +03:00			`from ..schemas import RecommendationSchema`
string_to_list to parse comma-separated string into a list 2020-09-12 15:43:22 +03:00			`from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00

Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 14:33:15 +03:00			`ROOT = Path(__file__).parent / "templates"`
			`TEMPLATE_PATH = ROOT / "quickstart_training.jinja"`
			`RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00

			`class Optimizations(str, Enum):`
			`efficiency = "efficiency"`
			`accuracy = "accuracy"`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 16:18:30 +03:00

			`@init_cli.command("config")`
			`def init_config_cli(`
			`# fmt: off`
Small wording adjustments [ci skip] 2020-08-21 13:06:19 +03:00			`output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),`
"model" terminology consistency in docs 2020-09-03 14:13:03 +03:00			`pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),`
			`cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),`
add pretraining option to init config 2020-09-17 17:05:40 +03:00			`pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 16:18:30 +03:00			`# fmt: on`
			`):`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`"""`
			`Generate a starter config.cfg for training. Based on your requirements`
			`specified via the CLI arguments, this command generates a config with the`
add entity_linker to jinja template 2020-09-22 11:40:05 +03:00			`optimal settings for your use case. This includes the choice of architecture,`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`pretrained weights and related hyperparameters.`
Update docs links in codebase 2020-09-04 13:58:50 +03:00
			`DOCS: https://nightly.spacy.io/api/cli#init-config`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`"""`
			`if isinstance(optimize, Optimizations): # instance of enum from the CLI`
			`optimize = optimize.value`
string_to_list to parse comma-separated string into a list 2020-09-12 15:43:22 +03:00			`pipeline = string_to_list(pipeline)`
Use consistent shortcut 2020-09-17 17:57:02 +03:00			`init_config(`
			`output_file,`
			`lang=lang,`
			`pipeline=pipeline,`
			`optimize=optimize,`
			`cpu=cpu,`
			`pretraining=pretraining,`
			`)`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 16:18:30 +03:00

Add init fill-config 2020-08-14 17:49:26 +03:00			`@init_cli.command("fill-config")`
			`def init_fill_config_cli(`
			`# fmt: off`
			`base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),`
			`output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),`
Use consistent shortcut 2020-09-17 17:57:02 +03:00			`pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),`
Add init fill-config 2020-08-14 17:49:26 +03:00			`diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")`
			`# fmt: on`
			`):`
			`"""`
			`Fill partial config.cfg with default values. Will add all missing settings`
			`from the default config and will create all objects, check the registered`
			`functions for their default values and update the base config. This command`
			`can be used with a config generated via the training quickstart widget:`
			`https://nightly.spacy.io/usage/training#quickstart`
Update docs links in codebase 2020-09-04 13:58:50 +03:00
			`DOCS: https://nightly.spacy.io/api/cli#init-fill-config`
Add init fill-config 2020-08-14 17:49:26 +03:00			`"""`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 16:56:03 +03:00			`fill_config(output_file, base_path, pretraining=pretraining, diff=diff)`
Add init fill-config 2020-08-14 17:49:26 +03:00

			`def fill_config(`
Update fill-config command and add silent mode [ci skip] 2020-09-01 13:07:04 +03:00			`output_file: Path,`
			`base_path: Path,`
			`*,`
			`pretraining: bool = False,`
			`diff: bool = False,`
			`silent: bool = False,`
Add init fill-config 2020-08-14 17:49:26 +03:00			`) -> Tuple[Config, Config]:`
			`is_stdout = str(output_file) == "-"`
Update fill-config command and add silent mode [ci skip] 2020-09-01 13:07:04 +03:00			`no_print = is_stdout or silent`
			`msg = Printer(no_print=no_print)`
Add init fill-config 2020-08-14 17:49:26 +03:00			`with show_validation_error(hint_fill=False):`
Fix CLI consistency [ci skip] 2020-08-16 16:46:29 +03:00			`config = util.load_config(base_path)`
Update config resolution to use new Thinc 2020-09-27 23:21:31 +03:00			`nlp = util.load_model_from_config(config, auto_fill=True, validate=False)`
Support removing extra values in fill-config (#5966) * Support removing extra values in fill-config * Fix test 2020-08-24 23:53:47 +03:00			`# Load a second time with validation to be extra sure that the produced`
			`# config result is a valid config`
Update config resolution to use new Thinc 2020-09-27 23:21:31 +03:00			`nlp = util.load_model_from_config(nlp.config)`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 16:56:03 +03:00			`filled = nlp.config`
			`if pretraining:`
			`validate_config_for_pretrain(filled, msg)`
			`pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)`
			`filled = pretrain_config.merge(filled)`
Show warnings if there's nothing to auto-fill 2020-08-16 15:19:43 +03:00			`before = config.to_str()`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 16:56:03 +03:00			`after = filled.to_str()`
Show warnings if there's nothing to auto-fill 2020-08-16 15:19:43 +03:00			`if before == after:`
			`msg.warn("Nothing to auto-fill: base config is already complete")`
			`else:`
			`msg.good("Auto-filled config with all values")`
Update fill-config command and add silent mode [ci skip] 2020-09-01 13:07:04 +03:00			`if diff and not no_print:`
Show warnings if there's nothing to auto-fill 2020-08-16 15:19:43 +03:00			`if before == after:`
			`msg.warn("No diff to show: nothing was auto-filled")`
			`else:`
			`msg.divider("START CONFIG DIFF")`
			`print("")`
			`print(diff_strings(before, after))`
			`msg.divider("END CONFIG DIFF")`
			`print("")`
Update fill-config command and add silent mode [ci skip] 2020-09-01 13:07:04 +03:00			`save_config(filled, output_file, is_stdout=is_stdout, silent=silent)`
			`return config, filled`
Add init fill-config 2020-08-14 17:49:26 +03:00

Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 16:18:30 +03:00			`def init_config(`
Use consistent shortcut 2020-09-17 17:57:02 +03:00			`output_file: Path,`
			`*,`
			`lang: str,`
			`pipeline: List[str],`
			`optimize: str,`
			`cpu: bool,`
			`pretraining: bool = False,`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`) -> None:`
			`is_stdout = str(output_file) == "-"`
			`msg = Printer(no_print=is_stdout)`
			`with TEMPLATE_PATH.open("r") as f:`
			`template = Template(f.read())`
Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 14:33:15 +03:00			`# Filter out duplicates since tok2vec and transformer are added by template`
			`pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]`
			`reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`variables = {`
			`"lang": lang,`
			`"components": pipeline,`
			`"optimize": optimize,`
			`"hardware": "cpu" if cpu else "gpu",`
Update quickstart, template and docs 2020-08-15 15:50:29 +03:00			`"transformer_data": reco["transformer"],`
			`"word_vectors": reco["word_vectors"],`
Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 14:33:15 +03:00			`"has_letters": reco["has_letters"],`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`}`
Small wording adjustments [ci skip] 2020-08-21 13:06:19 +03:00			`if variables["transformer_data"] and not has_spacy_transformers():`
			`msg.warn(`
			`"To generate a more effective transformer-based config (GPU-only), "`
			`"install the spacy-transformers package and re-run this command. "`
			`"The config generated now does not use transformers."`
			`)`
			`variables["transformer_data"] = None`
Update quickstart, template and docs 2020-08-15 15:50:29 +03:00			`base_template = template.render(variables).strip()`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`# Giving up on getting the newlines right in jinja for now`
			`base_template = re.sub(r"\n\n\n+", "\n\n", base_template)`
Update quickstart, template and docs 2020-08-15 15:50:29 +03:00			`# Access variables declared in templates`
			`template_vars = template.make_module(variables)`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`use_case = {`
			`"Language": lang,`
			`"Pipeline": ", ".join(pipeline),`
			`"Optimize for": optimize,`
			`"Hardware": variables["hardware"].upper(),`
Update quickstart, template and docs 2020-08-15 15:50:29 +03:00			`"Transformer": template_vars.transformer.get("name", False),`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`}`
Update quickstart, template and docs 2020-08-15 15:50:29 +03:00			`msg.info("Generated template specific for your use case")`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`for label, value in use_case.items():`
			`msg.text(f"- {label}: {value}")`
Add init fill-config 2020-08-14 17:49:26 +03:00			`with show_validation_error(hint_fill=False):`
Update quickstart, template and docs 2020-08-15 15:50:29 +03:00			`config = util.load_config_from_str(base_template)`
Update config resolution to use new Thinc 2020-09-27 23:21:31 +03:00			`nlp = util.load_model_from_config(config, auto_fill=True)`
add pretraining option to init config 2020-09-17 17:05:40 +03:00			`config = nlp.config`
			`if pretraining:`
			`validate_config_for_pretrain(config, msg)`
			`pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)`
			`config = pretrain_config.merge(config)`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`msg.good("Auto-filled config with all values")`
add pretraining option to init config 2020-09-17 17:05:40 +03:00			`save_config(config, output_file, is_stdout=is_stdout)`
Add init fill-config 2020-08-14 17:49:26 +03:00

Update fill-config command and add silent mode [ci skip] 2020-09-01 13:07:04 +03:00			`def save_config(`
			`config: Config, output_file: Path, is_stdout: bool = False, silent: bool = False`
			`) -> None:`
			`no_print = is_stdout or silent`
			`msg = Printer(no_print=no_print)`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`if is_stdout:`
Add init fill-config 2020-08-14 17:49:26 +03:00			`print(config.to_str())`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`else:`
Small cleanup and adjustments 2020-08-26 11:26:57 +03:00			`if not output_file.parent.exists():`
			`output_file.parent.mkdir(parents=True)`
Add init fill-config 2020-08-14 17:49:26 +03:00			`config.to_disk(output_file, interpolate=False)`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`msg.good("Saved config", output_file)`
"model" terminology consistency in docs 2020-09-03 14:13:03 +03:00			`msg.text("You can now add your data and train your pipeline:")`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]`
Update fill-config command and add silent mode [ci skip] 2020-09-01 13:07:04 +03:00			`if not no_print:`
			`print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 16:18:30 +03:00

Small wording adjustments [ci skip] 2020-08-21 13:06:19 +03:00			`def has_spacy_transformers() -> bool:`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`try:`
			`import spacy_transformers # noqa: F401`
Small wording adjustments [ci skip] 2020-08-21 13:06:19 +03:00
			`return True`
Update for new Thinc and adjust config 2020-08-13 18:38:30 +03:00			`except ImportError:`
output_file required, spacy-transformers prefered instead of required 2020-08-18 14:38:43 +03:00			`return False`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 16:56:03 +03:00

			`def validate_config_for_pretrain(config: Config, msg: Printer) -> None:`
			`if "tok2vec" not in config["nlp"]["pipeline"]:`
			`msg.warn(`
			`"No tok2vec component found in the pipeline. If your tok2vec "`
			`"component has a different name, you may need to adjust the "`
			`"tok2vec_model reference in the [pretraining] block. If you don't "`
			`"have a tok2vec component, make sure to add it to your [components] "`
			`"and the pipeline specified in the [nlp] block, so you can pretrain "`
			`"weights for it."`
			`)`