mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Merge pull request #5916 from explosion/feature/new-thinc-config
This commit is contained in:
commit
45f13cbf64
|
@ -1,8 +1,9 @@
|
||||||
recursive-include include *.h
|
recursive-include include *.h
|
||||||
recursive-include spacy *.pyx *.pxd *.txt *.cfg
|
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
|
||||||
include LICENSE
|
include LICENSE
|
||||||
include README.md
|
include README.md
|
||||||
include pyproject.toml
|
include pyproject.toml
|
||||||
recursive-exclude spacy/lang *.json
|
recursive-exclude spacy/lang *.json
|
||||||
recursive-include spacy/lang *.json.gz
|
recursive-include spacy/lang *.json.gz
|
||||||
|
recursive-include spacy/cli *.json
|
||||||
recursive-include licenses *
|
recursive-include licenses *
|
||||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a23,<8.0.0a30",
|
"thinc>=8.0.0a27,<8.0.0a30",
|
||||||
"blis>=0.4.0,<0.5.0",
|
"blis>=0.4.0,<0.5.0",
|
||||||
"pytokenizations",
|
"pytokenizations",
|
||||||
"smart_open>=2.0.0,<3.0.0"
|
"smart_open>=2.0.0,<3.0.0"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a23,<8.0.0a30
|
thinc>=8.0.0a27,<8.0.0a30
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets>=0.1.1
|
ml_datasets>=0.1.1
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
@ -26,3 +26,4 @@ pytest>=4.6.5
|
||||||
pytest-timeout>=1.3.0,<2.0.0
|
pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.5.0,<3.6.0
|
flake8>=3.5.0,<3.6.0
|
||||||
|
jinja2
|
||||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0a23,<8.0.0a30
|
thinc>=8.0.0a27,<8.0.0a30
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a23,<8.0.0a30
|
thinc>=8.0.0a27,<8.0.0a30
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.7.1,<1.1.0
|
wasabi>=0.7.1,<1.1.0
|
||||||
srsly>=2.1.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
|
|
|
@ -15,7 +15,7 @@ from .debug_model import debug_model # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_model import init_model # noqa: F401
|
||||||
from .init_config import init_config # noqa: F401
|
from .init_config import init_config, fill_config # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project.clone import project_clone # noqa: F401
|
from .project.clone import project_clone # noqa: F401
|
||||||
from .project.assets import project_assets # noqa: F401
|
from .project.assets import project_assets # noqa: F401
|
||||||
|
|
|
@ -179,13 +179,13 @@ def show_validation_error(
|
||||||
file_path: Optional[Union[str, Path]] = None,
|
file_path: Optional[Union[str, Path]] = None,
|
||||||
*,
|
*,
|
||||||
title: str = "Config validation error",
|
title: str = "Config validation error",
|
||||||
hint_init: bool = True,
|
hint_fill: bool = True,
|
||||||
):
|
):
|
||||||
"""Helper to show custom config validation errors on the CLI.
|
"""Helper to show custom config validation errors on the CLI.
|
||||||
|
|
||||||
file_path (str / Path): Optional file path of config file, used in hints.
|
file_path (str / Path): Optional file path of config file, used in hints.
|
||||||
title (str): Title of the custom formatted error.
|
title (str): Title of the custom formatted error.
|
||||||
hint_init (bool): Show hint about filling config.
|
hint_fill (bool): Show hint about filling config.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
yield
|
yield
|
||||||
|
@ -195,14 +195,14 @@ def show_validation_error(
|
||||||
# helper for this in Thinc
|
# helper for this in Thinc
|
||||||
err_text = str(e).replace("Config validation error", "").strip()
|
err_text = str(e).replace("Config validation error", "").strip()
|
||||||
print(err_text)
|
print(err_text)
|
||||||
if hint_init and "field required" in err_text:
|
if hint_fill and "field required" in err_text:
|
||||||
config_path = file_path if file_path is not None else "config.cfg"
|
config_path = file_path if file_path is not None else "config.cfg"
|
||||||
msg.text(
|
msg.text(
|
||||||
"If your config contains missing values, you can run the 'init "
|
"If your config contains missing values, you can run the 'init "
|
||||||
"config' command to fill in all the defaults, if possible:",
|
"fill-config' command to fill in all the defaults, if possible:",
|
||||||
spaced=True,
|
spaced=True,
|
||||||
)
|
)
|
||||||
print(f"{COMMAND} init config {config_path} --base {config_path}\n")
|
print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ import sys
|
||||||
import srsly
|
import srsly
|
||||||
from wasabi import Printer, MESSAGES, msg, diff_strings
|
from wasabi import Printer, MESSAGES, msg, diff_strings
|
||||||
import typer
|
import typer
|
||||||
from thinc.api import Config
|
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||||
from ._util import import_code, debug_cli, get_sourced_components
|
from ._util import import_code, debug_cli, get_sourced_components
|
||||||
|
@ -49,7 +48,7 @@ def debug_config_cli(
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
import_code(code_path)
|
import_code(code_path)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path, overrides=overrides)
|
config = util.load_config(config_path, overrides=overrides)
|
||||||
try:
|
try:
|
||||||
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
|
nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
|
@ -134,7 +133,7 @@ def debug_data(
|
||||||
if not config_path.exists():
|
if not config_path.exists():
|
||||||
msg.fail("Config file not found", config_path, exists=1)
|
msg.fail("Config file not found", config_path, exists=1)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
cfg = Config().from_disk(config_path, overrides=config_overrides)
|
cfg = util.load_config(config_path, overrides=config_overrides)
|
||||||
nlp, config = util.load_model_from_config(cfg)
|
nlp, config = util.load_model_from_config(cfg)
|
||||||
# Use original config here, not resolved version
|
# Use original config here, not resolved version
|
||||||
sourced_components = get_sourced_components(cfg)
|
sourced_components = get_sourced_components(cfg)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Dict, Any, Optional
|
from typing import Dict, Any, Optional
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
|
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
||||||
from thinc.api import Model, data_validation
|
from thinc.api import Model, data_validation
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
|
@ -49,16 +49,15 @@ def debug_model_cli(
|
||||||
}
|
}
|
||||||
config_overrides = parse_config_overrides(ctx.args)
|
config_overrides = parse_config_overrides(ctx.args)
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
cfg = Config().from_disk(config_path, overrides=config_overrides)
|
config = util.load_config(config_path, overrides=config_overrides)
|
||||||
try:
|
try:
|
||||||
nlp, config = util.load_model_from_config(cfg)
|
nlp, config = util.load_model_from_config(config_path)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.fail(str(e), exits=1)
|
msg.fail(str(e), exits=1)
|
||||||
seed = config["pretraining"]["seed"]
|
seed = config["pretraining"]["seed"]
|
||||||
if seed is not None:
|
if seed is not None:
|
||||||
msg.info(f"Fixing random seed: {seed}")
|
msg.info(f"Fixing random seed: {seed}")
|
||||||
fix_random_seed(seed)
|
fix_random_seed(seed)
|
||||||
|
|
||||||
pipe = nlp.get_pipe(component)
|
pipe = nlp.get_pipe(component)
|
||||||
if hasattr(pipe, "model"):
|
if hasattr(pipe, "model"):
|
||||||
model = pipe.model
|
model = pipe.model
|
||||||
|
|
|
@ -1,81 +1,185 @@
|
||||||
from typing import Optional, List
|
from typing import Optional, List, Tuple
|
||||||
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer, diff_strings
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
from wasabi import msg
|
from pydantic import BaseModel
|
||||||
|
import srsly
|
||||||
|
import re
|
||||||
|
|
||||||
from ..util import load_model_from_config, get_lang_class, load_model
|
from .. import util
|
||||||
from ._util import init_cli, Arg, Opt, show_validation_error
|
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||||
|
|
||||||
|
|
||||||
|
TEMPLATE_ROOT = Path(__file__).parent / "templates"
|
||||||
|
TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
|
||||||
|
RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
|
||||||
|
|
||||||
|
|
||||||
|
class Optimizations(str, Enum):
|
||||||
|
efficiency = "efficiency"
|
||||||
|
accuracy = "accuracy"
|
||||||
|
|
||||||
|
|
||||||
|
class RecommendationsTrfItem(BaseModel):
|
||||||
|
name: str
|
||||||
|
size_factor: int
|
||||||
|
|
||||||
|
|
||||||
|
class RecommendationsTrf(BaseModel):
|
||||||
|
efficiency: RecommendationsTrfItem
|
||||||
|
accuracy: RecommendationsTrfItem
|
||||||
|
|
||||||
|
|
||||||
|
class RecommendationSchema(BaseModel):
|
||||||
|
word_vectors: Optional[str] = None
|
||||||
|
transformer: Optional[RecommendationsTrf] = None
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("config")
|
@init_cli.command("config")
|
||||||
def init_config_cli(
|
def init_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True),
|
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
|
||||||
base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False),
|
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||||
model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"),
|
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
|
||||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Optional language code for blank config"),
|
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use")
|
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Generate a starter config.cfg for training."""
|
"""
|
||||||
validate_cli_args(base_path, model, lang)
|
Generate a starter config.cfg for training. Based on your requirements
|
||||||
is_stdout = str(output_path) == "-"
|
specified via the CLI arguments, this command generates a config with the
|
||||||
pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else []
|
optimal settings for your use case. This includes the choice of architecture,
|
||||||
cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout)
|
pretrained weights and related hyperparameters.
|
||||||
if is_stdout:
|
"""
|
||||||
print(cfg.to_str())
|
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
||||||
|
optimize = optimize.value
|
||||||
|
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||||
|
init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
|
||||||
|
|
||||||
|
|
||||||
|
@init_cli.command("fill-config")
|
||||||
|
def init_fill_config_cli(
|
||||||
|
# fmt: off
|
||||||
|
base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
|
||||||
|
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
|
||||||
|
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Fill partial config.cfg with default values. Will add all missing settings
|
||||||
|
from the default config and will create all objects, check the registered
|
||||||
|
functions for their default values and update the base config. This command
|
||||||
|
can be used with a config generated via the training quickstart widget:
|
||||||
|
https://nightly.spacy.io/usage/training#quickstart
|
||||||
|
"""
|
||||||
|
fill_config(output_file, base_path, diff=diff)
|
||||||
|
|
||||||
|
|
||||||
|
def fill_config(
|
||||||
|
output_file: Path, base_path: Path, *, diff: bool = False
|
||||||
|
) -> Tuple[Config, Config]:
|
||||||
|
is_stdout = str(output_file) == "-"
|
||||||
|
msg = Printer(no_print=is_stdout)
|
||||||
|
with show_validation_error(hint_fill=False):
|
||||||
|
with msg.loading("Auto-filling config..."):
|
||||||
|
config = util.load_config(base_path)
|
||||||
|
try:
|
||||||
|
nlp, _ = util.load_model_from_config(config, auto_fill=True)
|
||||||
|
except ValueError as e:
|
||||||
|
msg.fail(str(e), exits=1)
|
||||||
|
before = config.to_str()
|
||||||
|
after = nlp.config.to_str()
|
||||||
|
if before == after:
|
||||||
|
msg.warn("Nothing to auto-fill: base config is already complete")
|
||||||
else:
|
else:
|
||||||
cfg.to_disk(output_path)
|
msg.good("Auto-filled config with all values")
|
||||||
msg.good("Saved config", output_path)
|
if diff and not is_stdout:
|
||||||
|
if before == after:
|
||||||
|
msg.warn("No diff to show: nothing was auto-filled")
|
||||||
|
else:
|
||||||
|
msg.divider("START CONFIG DIFF")
|
||||||
|
print("")
|
||||||
|
print(diff_strings(before, after))
|
||||||
|
msg.divider("END CONFIG DIFF")
|
||||||
|
print("")
|
||||||
|
save_config(nlp.config, output_file, is_stdout=is_stdout)
|
||||||
|
|
||||||
|
|
||||||
def init_config(
|
def init_config(
|
||||||
output_path: Path,
|
output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
|
||||||
config_path: Optional[Path],
|
) -> None:
|
||||||
model: Optional[str],
|
is_stdout = str(output_file) == "-"
|
||||||
lang: Optional[str],
|
msg = Printer(no_print=is_stdout)
|
||||||
pipeline: Optional[List[str]],
|
|
||||||
silent: bool = False,
|
|
||||||
) -> Config:
|
|
||||||
if config_path is not None:
|
|
||||||
msg.info("Generating config from base config", show=not silent)
|
|
||||||
with show_validation_error(config_path, hint_init=False):
|
|
||||||
config = Config().from_disk(config_path)
|
|
||||||
try:
|
try:
|
||||||
nlp, _ = load_model_from_config(config, auto_fill=True)
|
from jinja2 import Template
|
||||||
|
except ImportError:
|
||||||
|
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
|
||||||
|
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
|
||||||
|
lang_defaults = util.get_lang_class(lang).Defaults
|
||||||
|
has_letters = lang_defaults.writing_system.get("has_letters", True)
|
||||||
|
# Filter out duplicates since tok2vec and transformer are added by template
|
||||||
|
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
||||||
|
reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
|
||||||
|
with TEMPLATE_PATH.open("r") as f:
|
||||||
|
template = Template(f.read())
|
||||||
|
variables = {
|
||||||
|
"lang": lang,
|
||||||
|
"components": pipeline,
|
||||||
|
"optimize": optimize,
|
||||||
|
"hardware": "cpu" if cpu else "gpu",
|
||||||
|
"transformer_data": reco["transformer"],
|
||||||
|
"word_vectors": reco["word_vectors"],
|
||||||
|
"has_letters": has_letters,
|
||||||
|
}
|
||||||
|
base_template = template.render(variables).strip()
|
||||||
|
# Giving up on getting the newlines right in jinja for now
|
||||||
|
base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
|
||||||
|
# Access variables declared in templates
|
||||||
|
template_vars = template.make_module(variables)
|
||||||
|
use_case = {
|
||||||
|
"Language": lang,
|
||||||
|
"Pipeline": ", ".join(pipeline),
|
||||||
|
"Optimize for": optimize,
|
||||||
|
"Hardware": variables["hardware"].upper(),
|
||||||
|
"Transformer": template_vars.transformer.get("name", False),
|
||||||
|
}
|
||||||
|
msg.info("Generated template specific for your use case")
|
||||||
|
for label, value in use_case.items():
|
||||||
|
msg.text(f"- {label}: {value}")
|
||||||
|
use_transformer = bool(template_vars.use_transformer)
|
||||||
|
if use_transformer:
|
||||||
|
require_spacy_transformers(msg)
|
||||||
|
with show_validation_error(hint_fill=False):
|
||||||
|
config = util.load_config_from_str(base_template)
|
||||||
|
try:
|
||||||
|
nlp, _ = util.load_model_from_config(config, auto_fill=True)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
msg.fail(str(e), exits=1)
|
msg.fail(str(e), exits=1)
|
||||||
return nlp.config
|
if use_transformer:
|
||||||
if model is not None:
|
nlp.config.pop("pretraining", {}) # TODO: solve this better
|
||||||
ext = f" with pipeline {pipeline}" if pipeline else ""
|
msg.good("Auto-filled config with all values")
|
||||||
msg.info(f"Generating config from model {model}{ext}", show=not silent)
|
save_config(nlp.config, output_file, is_stdout=is_stdout)
|
||||||
nlp = load_model(model)
|
|
||||||
for existing_pipe_name in nlp.pipe_names:
|
|
||||||
if existing_pipe_name not in pipeline:
|
|
||||||
nlp.remove_pipe(existing_pipe_name)
|
|
||||||
for pipe_name in pipeline:
|
|
||||||
if pipe_name not in nlp.pipe_names:
|
|
||||||
nlp.add_pipe(pipe_name)
|
|
||||||
return nlp.config
|
|
||||||
if lang is not None:
|
|
||||||
ext = f" with pipeline {pipeline}" if pipeline else ""
|
|
||||||
msg.info(f"Generating config for language '{lang}'{ext}", show=not silent)
|
|
||||||
nlp = get_lang_class(lang)()
|
|
||||||
for pipe_name in pipeline:
|
|
||||||
nlp.add_pipe(pipe_name)
|
|
||||||
return nlp.config
|
|
||||||
|
|
||||||
|
|
||||||
def validate_cli_args(
|
def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> None:
|
||||||
config_path: Optional[Path], model: Optional[str], lang: Optional[str]
|
msg = Printer(no_print=is_stdout)
|
||||||
) -> None:
|
if is_stdout:
|
||||||
args = {"--base": config_path, "--model": model, "--lang": lang}
|
print(config.to_str())
|
||||||
if sum(arg is not None for arg in args.values()) != 1:
|
else:
|
||||||
existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None)
|
config.to_disk(output_file, interpolate=False)
|
||||||
|
msg.good("Saved config", output_file)
|
||||||
|
msg.text("You can now add your data and train your model:")
|
||||||
|
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
|
||||||
|
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
|
||||||
|
|
||||||
|
|
||||||
|
def require_spacy_transformers(msg: Printer) -> None:
|
||||||
|
try:
|
||||||
|
import spacy_transformers # noqa: F401
|
||||||
|
except ImportError:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"The init config command expects only one of the following arguments: "
|
"Using a transformer-based pipeline requires spacy-transformers "
|
||||||
"--base (base config to fill and update), --lang (language code to "
|
"to be installed.",
|
||||||
"use for blank config) or --model (base model to copy config from).",
|
|
||||||
f"Got: {existing if existing else 'no arguments'}",
|
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
|
|
@ -5,7 +5,7 @@ import time
|
||||||
import re
|
import re
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
|
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||||
from thinc.api import CosineDistance, L2Distance
|
from thinc.api import CosineDistance, L2Distance
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
@ -88,7 +88,7 @@ def pretrain(
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
msg.info(f"Loading config from: {config_path}")
|
msg.info(f"Loading config from: {config_path}")
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path, overrides=config_overrides)
|
config = util.load_config(config_path, overrides=config_overrides)
|
||||||
nlp, config = util.load_model_from_config(config)
|
nlp, config = util.load_model_from_config(config)
|
||||||
# TODO: validate that [pretraining] block exists
|
# TODO: validate that [pretraining] block exists
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
|
|
237
spacy/cli/templates/quickstart_training.jinja
Normal file
237
spacy/cli/templates/quickstart_training.jinja
Normal file
|
@ -0,0 +1,237 @@
|
||||||
|
{# This is a template for training configs used for the quickstart widget in
|
||||||
|
the docs and the init config command. It encodes various best practices and
|
||||||
|
can help generate the best possible configuration, given a user's requirements. #}
|
||||||
|
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
|
||||||
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
|
[paths]
|
||||||
|
train = ""
|
||||||
|
dev = ""
|
||||||
|
|
||||||
|
[system]
|
||||||
|
use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "{{ lang }}"
|
||||||
|
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
||||||
|
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
||||||
|
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
{# TRANSFORMER PIPELINE #}
|
||||||
|
{%- if use_transformer -%}
|
||||||
|
[components.transformer]
|
||||||
|
factory = "transformer"
|
||||||
|
|
||||||
|
[components.transformer.model]
|
||||||
|
@architectures = "spacy-transformers.TransformerModel.v1"
|
||||||
|
name = "{{ transformer["name"] }}"
|
||||||
|
tokenizer_config = {"use_fast": true}
|
||||||
|
|
||||||
|
[components.transformer.model.get_spans]
|
||||||
|
@span_getters = "strided_spans.v1"
|
||||||
|
window = 128
|
||||||
|
stride = 96
|
||||||
|
|
||||||
|
{% if "tagger" in components %}
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "parser" in components -%}
|
||||||
|
[components.parser]
|
||||||
|
factory = "parser"
|
||||||
|
|
||||||
|
[components.parser.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 8
|
||||||
|
hidden_width = 128
|
||||||
|
maxout_pieces = 3
|
||||||
|
use_upper = false
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.parser.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.parser.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "ner" in components -%}
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 3
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
use_upper = false
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.Tok2VecListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
|
{# NON-TRANSFORMER PIPELINE #}
|
||||||
|
{% else -%}
|
||||||
|
|
||||||
|
{%- if hardware == "gpu" -%}
|
||||||
|
# There are no recommended transformer weights available for language '{{ lang }}'
|
||||||
|
# yet, so the pipeline described here is not transformer-based.
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
||||||
|
also_embed_subwords = {{ true if has_letters else false }}
|
||||||
|
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
width = {{ 96 if optimize == "efficiency" else 256 }}
|
||||||
|
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
|
||||||
|
{% if "tagger" in components %}
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "parser" in components -%}
|
||||||
|
[components.parser]
|
||||||
|
factory = "parser"
|
||||||
|
|
||||||
|
[components.parser.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 8
|
||||||
|
hidden_width = 128
|
||||||
|
maxout_pieces = 3
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.parser.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{% if "ner" in components %}
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
nr_feature_tokens = 6
|
||||||
|
hidden_width = 64
|
||||||
|
maxout_pieces = 2
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode:width}
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% for pipe in components %}
|
||||||
|
{% if pipe not in ["tagger", "parser", "ner"] %}
|
||||||
|
{# Other components defined by the user: we just assume they're factories #}
|
||||||
|
[components.{{ pipe }}]
|
||||||
|
factory = "{{ pipe }}"
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
[training]
|
||||||
|
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||||
|
vectors = null
|
||||||
|
{% else -%}
|
||||||
|
vectors = "{{ word_vectors }}"
|
||||||
|
{% endif -%}
|
||||||
|
{% if use_transformer -%}
|
||||||
|
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
|
||||||
|
[training.optimizer.learn_rate]
|
||||||
|
@schedules = "warmup_linear.v1"
|
||||||
|
warmup_steps = 250
|
||||||
|
total_steps = 20000
|
||||||
|
initial_rate = 5e-5
|
||||||
|
|
||||||
|
[training.train_corpus]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths:train}
|
||||||
|
max_length = {{ 500 if hardware == "gpu" else 0 }}
|
||||||
|
|
||||||
|
[training.dev_corpus]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths:dev}
|
||||||
|
max_length = 0
|
||||||
|
|
||||||
|
{% if use_transformer %}
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "batch_by_padded.v1"
|
||||||
|
discard_oversize = true
|
||||||
|
size = 2000
|
||||||
|
buffer = 256
|
||||||
|
{%- else %}
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
{%- if "tagger" in components %}
|
||||||
|
tag_acc = {{ (1.0 / components|length)|round(2) }}
|
||||||
|
{%- endif -%}
|
||||||
|
{%- if "parser" in components %}
|
||||||
|
dep_uas = 0.0
|
||||||
|
dep_las = {{ (1.0 / components|length)|round(2) }}
|
||||||
|
sents_f = 0.0
|
||||||
|
{%- endif %}
|
||||||
|
{%- if "ner" in components %}
|
||||||
|
ents_f = {{ (1.0 / components|length)|round(2) }}
|
||||||
|
ents_p = 0.0
|
||||||
|
ents_r = 0.0
|
||||||
|
{%- endif -%}
|
13
spacy/cli/templates/quickstart_training_recommendations.json
Normal file
13
spacy/cli/templates/quickstart_training_recommendations.json
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
{
|
||||||
|
"en": {
|
||||||
|
"word_vectors": "en_vectors_web_lg",
|
||||||
|
"transformer": {
|
||||||
|
"efficiency": { "name": "roberta-base", "size_factor": 3 },
|
||||||
|
"accuracy": { "name": "roberta-base", "size_factor": 3 }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"de": {
|
||||||
|
"word_vectors": null,
|
||||||
|
"transformer": null
|
||||||
|
}
|
||||||
|
}
|
|
@ -75,7 +75,7 @@ def train(
|
||||||
msg.info("Using CPU")
|
msg.info("Using CPU")
|
||||||
msg.info(f"Loading config and nlp from: {config_path}")
|
msg.info(f"Loading config and nlp from: {config_path}")
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = Config().from_disk(config_path, overrides=config_overrides)
|
config = util.load_config(config_path, overrides=config_overrides)
|
||||||
if config.get("training", {}).get("seed") is not None:
|
if config.get("training", {}).get("seed") is not None:
|
||||||
fix_random_seed(config["training"]["seed"])
|
fix_random_seed(config["training"]["seed"])
|
||||||
# Use original config here before it's resolved to functions
|
# Use original config here before it's resolved to functions
|
||||||
|
|
|
@ -21,7 +21,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||||
from .gold import Example, validate_examples
|
from .gold import Example, validate_examples
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from .util import create_default_optimizer, registry
|
from .util import create_default_optimizer, registry
|
||||||
from .util import SimpleFrozenDict, combine_score_weights
|
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
||||||
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .lang.punctuation import TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_INFIXES
|
||||||
|
@ -36,7 +36,7 @@ from . import about
|
||||||
|
|
||||||
# This is the base config with all settings (training etc.)
|
# This is the base config with all settings (training etc.)
|
||||||
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
||||||
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
|
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
|
||||||
|
|
||||||
|
|
||||||
class BaseDefaults:
|
class BaseDefaults:
|
||||||
|
@ -45,7 +45,7 @@ class BaseDefaults:
|
||||||
Language.Defaults.
|
Language.Defaults.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
config: Config = Config()
|
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
|
||||||
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
|
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
|
||||||
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
|
prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
|
||||||
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
|
suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
|
||||||
|
@ -134,7 +134,7 @@ class Language:
|
||||||
# of the rest.
|
# of the rest.
|
||||||
util.registry._entry_point_factories.get_all()
|
util.registry._entry_point_factories.get_all()
|
||||||
|
|
||||||
self._config = util.deep_merge_configs(self.default_config, DEFAULT_CONFIG)
|
self._config = DEFAULT_CONFIG.merge(self.default_config)
|
||||||
self._meta = dict(meta)
|
self._meta = dict(meta)
|
||||||
self._path = None
|
self._path = None
|
||||||
self._optimizer = None
|
self._optimizer = None
|
||||||
|
@ -167,9 +167,7 @@ class Language:
|
||||||
|
|
||||||
def __init_subclass__(cls, **kwargs):
|
def __init_subclass__(cls, **kwargs):
|
||||||
super().__init_subclass__(**kwargs)
|
super().__init_subclass__(**kwargs)
|
||||||
cls.default_config = util.deep_merge_configs(
|
cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
|
||||||
cls.Defaults.config, DEFAULT_CONFIG
|
|
||||||
)
|
|
||||||
cls.default_config["nlp"]["lang"] = cls.lang
|
cls.default_config["nlp"]["lang"] = cls.lang
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -532,6 +530,7 @@ class Language:
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
*,
|
*,
|
||||||
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
||||||
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> Callable[[Doc], Doc]:
|
) -> Callable[[Doc], Doc]:
|
||||||
"""Create a pipeline component. Mostly used internally. To create and
|
"""Create a pipeline component. Mostly used internally. To create and
|
||||||
|
@ -542,6 +541,7 @@ class Language:
|
||||||
Defaults to factory name if not set.
|
Defaults to factory name if not set.
|
||||||
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
||||||
component. Will be merged with default config, if available.
|
component. Will be merged with default config, if available.
|
||||||
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
@ -568,7 +568,7 @@ class Language:
|
||||||
# This is unideal, but the alternative would mean you always need to
|
# This is unideal, but the alternative would mean you always need to
|
||||||
# specify the full config settings, which is not really viable.
|
# specify the full config settings, which is not really viable.
|
||||||
if pipe_meta.default_config:
|
if pipe_meta.default_config:
|
||||||
config = util.deep_merge_configs(config, pipe_meta.default_config)
|
config = Config(pipe_meta.default_config).merge(config)
|
||||||
# We need to create a top-level key because Thinc doesn't allow resolving
|
# We need to create a top-level key because Thinc doesn't allow resolving
|
||||||
# top-level references to registered functions. Also gives nicer errors.
|
# top-level references to registered functions. Also gives nicer errors.
|
||||||
# The name allows components to know their pipe name and use it in the
|
# The name allows components to know their pipe name and use it in the
|
||||||
|
@ -582,12 +582,14 @@ class Language:
|
||||||
cfg = {factory_name: config}
|
cfg = {factory_name: config}
|
||||||
# We're calling the internal _fill here to avoid constructing the
|
# We're calling the internal _fill here to avoid constructing the
|
||||||
# registered functions twice
|
# registered functions twice
|
||||||
# TODO: customize validation to make it more readable / relate it to
|
|
||||||
# pipeline component and why it failed, explain default config
|
|
||||||
resolved, filled = registry.resolve(cfg, validate=validate)
|
resolved, filled = registry.resolve(cfg, validate=validate)
|
||||||
filled = filled[factory_name]
|
filled = Config(filled[factory_name])
|
||||||
filled["factory"] = factory_name
|
filled["factory"] = factory_name
|
||||||
filled.pop("@factories", None)
|
filled.pop("@factories", None)
|
||||||
|
# Merge the final filled config with the raw config (including non-
|
||||||
|
# interpolated variables)
|
||||||
|
if raw_config:
|
||||||
|
filled = filled.merge(raw_config)
|
||||||
self._pipe_configs[name] = filled
|
self._pipe_configs[name] = filled
|
||||||
return resolved[factory_name]
|
return resolved[factory_name]
|
||||||
|
|
||||||
|
@ -613,7 +615,10 @@ class Language:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
pipe = source.get_pipe(source_name)
|
pipe = source.get_pipe(source_name)
|
||||||
pipe_config = util.copy_config(source.config["components"][source_name])
|
# Make sure the source config is interpolated so we don't end up with
|
||||||
|
# orphaned variables in our final config
|
||||||
|
source_config = source.config.interpolate()
|
||||||
|
pipe_config = util.copy_config(source_config["components"][source_name])
|
||||||
self._pipe_configs[name] = pipe_config
|
self._pipe_configs[name] = pipe_config
|
||||||
return pipe, pipe_config["factory"]
|
return pipe, pipe_config["factory"]
|
||||||
|
|
||||||
|
@ -628,6 +633,7 @@ class Language:
|
||||||
last: Optional[bool] = None,
|
last: Optional[bool] = None,
|
||||||
source: Optional["Language"] = None,
|
source: Optional["Language"] = None,
|
||||||
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
|
||||||
|
raw_config: Optional[Config] = None,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
) -> Callable[[Doc], Doc]:
|
) -> Callable[[Doc], Doc]:
|
||||||
"""Add a component to the processing pipeline. Valid components are
|
"""Add a component to the processing pipeline. Valid components are
|
||||||
|
@ -649,6 +655,7 @@ class Language:
|
||||||
component from.
|
component from.
|
||||||
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
||||||
component. Will be merged with default config, if available.
|
component. Will be merged with default config, if available.
|
||||||
|
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||||
validate (bool): Whether to validate the component config against the
|
validate (bool): Whether to validate the component config against the
|
||||||
arguments and types expected by the factory.
|
arguments and types expected by the factory.
|
||||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||||
|
@ -678,7 +685,11 @@ class Language:
|
||||||
lang_code=self.lang,
|
lang_code=self.lang,
|
||||||
)
|
)
|
||||||
pipe_component = self.create_pipe(
|
pipe_component = self.create_pipe(
|
||||||
factory_name, name=name, config=config, validate=validate,
|
factory_name,
|
||||||
|
name=name,
|
||||||
|
config=config,
|
||||||
|
raw_config=raw_config,
|
||||||
|
validate=validate,
|
||||||
)
|
)
|
||||||
pipe_index = self._get_pipe_index(before, after, first, last)
|
pipe_index = self._get_pipe_index(before, after, first, last)
|
||||||
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
||||||
|
@ -1379,7 +1390,9 @@ class Language:
|
||||||
DOCS: https://spacy.io/api/language#from_config
|
DOCS: https://spacy.io/api/language#from_config
|
||||||
"""
|
"""
|
||||||
if auto_fill:
|
if auto_fill:
|
||||||
config = util.deep_merge_configs(config, cls.default_config)
|
config = Config(
|
||||||
|
cls.default_config, section_order=CONFIG_SECTION_ORDER
|
||||||
|
).merge(config)
|
||||||
if "nlp" not in config:
|
if "nlp" not in config:
|
||||||
raise ValueError(Errors.E985.format(config=config))
|
raise ValueError(Errors.E985.format(config=config))
|
||||||
config_lang = config["nlp"]["lang"]
|
config_lang = config["nlp"]["lang"]
|
||||||
|
@ -1417,16 +1430,20 @@ class Language:
|
||||||
or lang_cls is not cls
|
or lang_cls is not cls
|
||||||
):
|
):
|
||||||
raise ValueError(Errors.E943.format(value=type(lang_cls)))
|
raise ValueError(Errors.E943.format(value=type(lang_cls)))
|
||||||
|
# Note that we don't load vectors here, instead they get loaded explicitly
|
||||||
|
# inside stuff like the spacy train function. If we loaded them here,
|
||||||
|
# then we would load them twice at runtime: once when we make from config,
|
||||||
|
# and then again when we load from disk.
|
||||||
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
|
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
|
||||||
if after_creation is not None:
|
if after_creation is not None:
|
||||||
nlp = after_creation(nlp)
|
nlp = after_creation(nlp)
|
||||||
if not isinstance(nlp, cls):
|
if not isinstance(nlp, cls):
|
||||||
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
|
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
|
||||||
# Note that we don't load vectors here, instead they get loaded explicitly
|
# To create the components we need to use the final interpolated config
|
||||||
# inside stuff like the spacy train function. If we loaded them here,
|
# so all values are available (if component configs use variables).
|
||||||
# then we would load them twice at runtime: once when we make from config,
|
# Later we replace the component config with the raw config again.
|
||||||
# and then again when we load from disk.
|
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
||||||
pipeline = config.get("components", {})
|
pipeline = interpolated.get("components", {})
|
||||||
# If components are loaded from a source (existing models), we cache
|
# If components are loaded from a source (existing models), we cache
|
||||||
# them here so they're only loaded once
|
# them here so they're only loaded once
|
||||||
source_nlps = {}
|
source_nlps = {}
|
||||||
|
@ -1435,6 +1452,7 @@ class Language:
|
||||||
opts = ", ".join(pipeline.keys())
|
opts = ", ".join(pipeline.keys())
|
||||||
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
||||||
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
||||||
|
raw_config = Config(filled["components"][pipe_name])
|
||||||
if pipe_name not in disable:
|
if pipe_name not in disable:
|
||||||
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
||||||
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
||||||
|
@ -1444,7 +1462,11 @@ class Language:
|
||||||
# The pipe name (key in the config) here is the unique name
|
# The pipe name (key in the config) here is the unique name
|
||||||
# of the component, not necessarily the factory
|
# of the component, not necessarily the factory
|
||||||
nlp.add_pipe(
|
nlp.add_pipe(
|
||||||
factory, name=pipe_name, config=pipe_cfg, validate=validate,
|
factory,
|
||||||
|
name=pipe_name,
|
||||||
|
config=pipe_cfg,
|
||||||
|
validate=validate,
|
||||||
|
raw_config=raw_config,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
model = pipe_cfg["source"]
|
model = pipe_cfg["source"]
|
||||||
|
|
|
@ -4,7 +4,7 @@ import spacy
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.de import German
|
from spacy.lang.de import German
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.util import registry, deep_merge_configs, load_model_from_config
|
from spacy.util import registry, load_model_from_config
|
||||||
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
||||||
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
||||||
|
|
||||||
|
@ -194,37 +194,6 @@ def test_serialize_parser():
|
||||||
assert upper.get_dim("nI") == 66
|
assert upper.get_dim("nI") == 66
|
||||||
|
|
||||||
|
|
||||||
def test_deep_merge_configs():
|
|
||||||
config = {"a": "hello", "b": {"c": "d"}}
|
|
||||||
defaults = {"a": "world", "b": {"c": "e", "f": "g"}}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 2
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"c": "d", "f": "g"}
|
|
||||||
config = {"a": "hello", "b": {"@test": "x", "foo": 1}}
|
|
||||||
defaults = {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 3
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"@test": "x", "foo": 1, "bar": 2}
|
|
||||||
assert merged["c"] == 100
|
|
||||||
config = {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100}
|
|
||||||
defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 3
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"@test": "x", "foo": 1}
|
|
||||||
assert merged["c"] == 100
|
|
||||||
# Test that leaving out the factory just adds to existing
|
|
||||||
config = {"a": "hello", "b": {"foo": 1}, "c": 100}
|
|
||||||
defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}}
|
|
||||||
merged = deep_merge_configs(config, defaults)
|
|
||||||
assert len(merged) == 3
|
|
||||||
assert merged["a"] == "hello"
|
|
||||||
assert merged["b"] == {"@test": "y", "foo": 1, "bar": 2}
|
|
||||||
assert merged["c"] == 100
|
|
||||||
|
|
||||||
|
|
||||||
def test_config_nlp_roundtrip():
|
def test_config_nlp_roundtrip():
|
||||||
"""Test that a config produced by the nlp object passes training config
|
"""Test that a config produced by the nlp object passes training config
|
||||||
validation."""
|
validation."""
|
||||||
|
@ -311,3 +280,22 @@ def test_config_overrides():
|
||||||
nlp = spacy.load(d)
|
nlp = spacy.load(d)
|
||||||
assert isinstance(nlp, English)
|
assert isinstance(nlp, English)
|
||||||
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
assert nlp.pipe_names == ["tok2vec", "tagger"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_interpolation():
|
||||||
|
config = Config().from_str(nlp_config_string, interpolate=False)
|
||||||
|
assert config["training"]["train_corpus"]["path"] == "${paths:train}"
|
||||||
|
interpolated = config.interpolate()
|
||||||
|
assert interpolated["training"]["train_corpus"]["path"] == ""
|
||||||
|
nlp = English.from_config(config)
|
||||||
|
assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
|
||||||
|
# Ensure that variables are preserved in nlp config
|
||||||
|
width = "${components.tok2vec.model:width}"
|
||||||
|
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
|
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
|
interpolated2 = nlp.config.interpolate()
|
||||||
|
assert interpolated2["training"]["train_corpus"]["path"] == ""
|
||||||
|
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
|
nlp2 = English.from_config(interpolated)
|
||||||
|
assert nlp2.config["training"]["train_corpus"]["path"] == ""
|
||||||
|
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
||||||
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.schemas import ProjectConfigSchema, validate
|
from spacy.schemas import ProjectConfigSchema, validate
|
||||||
from spacy.cli.pretrain import make_docs
|
from spacy.cli.pretrain import make_docs
|
||||||
|
from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH
|
||||||
|
from spacy.cli.init_config import RecommendationSchema
|
||||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||||
|
from spacy.util import get_lang_class
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
|
||||||
def test_cli_converters_conllu2json():
|
def test_cli_converters_conllu2json():
|
||||||
|
@ -319,3 +322,20 @@ def test_parse_config_overrides(args, expected):
|
||||||
def test_parse_config_overrides_invalid(args):
|
def test_parse_config_overrides_invalid(args):
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
parse_config_overrides(args)
|
parse_config_overrides(args)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||||
|
def test_init_config(lang, pipeline, optimize):
|
||||||
|
# TODO: add more tests and also check for GPU with transformers
|
||||||
|
init_config("-", lang=lang, pipeline=pipeline, optimize=optimize, cpu=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_recommendations():
|
||||||
|
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
|
||||||
|
for lang, data in recommendations.items():
|
||||||
|
assert get_lang_class(lang)
|
||||||
|
assert RecommendationSchema(**data)
|
||||||
|
|
|
@ -58,6 +58,12 @@ if TYPE_CHECKING:
|
||||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||||
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
|
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
|
||||||
|
|
||||||
|
# Default order of sections in the config.cfg. Not all sections need to exist,
|
||||||
|
# and additional sections are added at the end, in alphabetical order.
|
||||||
|
# fmt: off
|
||||||
|
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
logger = logging.getLogger("spacy")
|
logger = logging.getLogger("spacy")
|
||||||
|
@ -263,9 +269,7 @@ def load_model_from_path(
|
||||||
if not meta:
|
if not meta:
|
||||||
meta = get_model_meta(model_path)
|
meta = get_model_meta(model_path)
|
||||||
config_path = model_path / "config.cfg"
|
config_path = model_path / "config.cfg"
|
||||||
if not config_path.exists() or not config_path.is_file():
|
config = load_config(config_path, overrides=dict_to_dot(config))
|
||||||
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
|
||||||
config = Config().from_disk(config_path, overrides=dict_to_dot(config))
|
|
||||||
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
|
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
|
||||||
return nlp.from_disk(model_path, exclude=disable)
|
return nlp.from_disk(model_path, exclude=disable)
|
||||||
|
|
||||||
|
@ -316,6 +320,29 @@ def load_model_from_init_py(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(
|
||||||
|
path: Union[str, Path],
|
||||||
|
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
|
interpolate: bool = False,
|
||||||
|
) -> Config:
|
||||||
|
"""Load a config file. Takes care of path validation and section order."""
|
||||||
|
config_path = ensure_path(path)
|
||||||
|
if not config_path.exists() or not config_path.is_file():
|
||||||
|
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
|
||||||
|
return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
|
||||||
|
config_path, overrides=overrides, interpolate=interpolate
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_config_from_str(
|
||||||
|
text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
|
||||||
|
):
|
||||||
|
"""Load a full config from a string."""
|
||||||
|
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
|
||||||
|
text, overrides=overrides, interpolate=interpolate,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_installed_models() -> List[str]:
|
def get_installed_models() -> List[str]:
|
||||||
"""List all model packages currently installed in the environment.
|
"""List all model packages currently installed in the environment.
|
||||||
|
|
||||||
|
@ -901,45 +928,6 @@ def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
|
||||||
raise ValueError(Errors.E961.format(config=config)) from None
|
raise ValueError(Errors.E961.format(config=config)) from None
|
||||||
|
|
||||||
|
|
||||||
def deep_merge_configs(
|
|
||||||
config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
|
|
||||||
) -> Config:
|
|
||||||
"""Deep merge two configs, a base config and its defaults. Ignores
|
|
||||||
references to registered functions to avoid filling in
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The config.
|
|
||||||
destination (Dict[str, Any]): The config defaults.
|
|
||||||
RETURNS (Dict[str, Any]): The merged config.
|
|
||||||
"""
|
|
||||||
config = copy_config(config)
|
|
||||||
merged = _deep_merge_configs(config, defaults)
|
|
||||||
return Config(merged)
|
|
||||||
|
|
||||||
|
|
||||||
def _deep_merge_configs(
|
|
||||||
config: Union[Dict[str, Any], Config], defaults: Union[Dict[str, Any], Config]
|
|
||||||
) -> Union[Dict[str, Any], Config]:
|
|
||||||
for key, value in defaults.items():
|
|
||||||
if isinstance(value, dict):
|
|
||||||
node = config.setdefault(key, {})
|
|
||||||
if not isinstance(node, dict):
|
|
||||||
continue
|
|
||||||
promises = [key for key in value if key.startswith("@")]
|
|
||||||
promise = promises[0] if promises else None
|
|
||||||
# We only update the block from defaults if it refers to the same
|
|
||||||
# registered function
|
|
||||||
if (
|
|
||||||
promise
|
|
||||||
and any(k.startswith("@") for k in node)
|
|
||||||
and (promise in node and node[promise] != value[promise])
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
defaults = _deep_merge_configs(node, value)
|
|
||||||
elif key not in config:
|
|
||||||
config[key] = value
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
|
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
|
||||||
"""Convert dot notation to a dict. For example: {"token.pos": True,
|
"""Convert dot notation to a dict. For example: {"token.pos": True,
|
||||||
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
|
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
|
||||||
|
|
|
@ -101,39 +101,62 @@ files and model directories.
|
||||||
|
|
||||||
### init config {#init-config new="3"}
|
### init config {#init-config new="3"}
|
||||||
|
|
||||||
Initialize and export a [`config.cfg` file](/usage/training#config) for training
|
Initialize and save a [`config.cfg` file](/usage/training#config) using the
|
||||||
and update it with all default values, if possible. Config files used for
|
**recommended settings** for your use case. It works just like the
|
||||||
training should always be complete and not contain any hidden defaults or
|
[quickstart widget](/usage/training#quickstart), except that it also auto-fills
|
||||||
missing values, so this command helps you create your final config. It takes
|
all default values and exports a [training](/usage/training#config)-ready
|
||||||
**one** of the following options:
|
config. The settings you specify will impact the suggested model architectures
|
||||||
|
and pipeline setup, as well as the hyperparameters. You can also adjust and
|
||||||
- `--base`: Base **config** to auto-fill, e.g. created using the
|
customize those settings in your config file later.
|
||||||
[training quickstart](/usage/training#quickstart) widget.
|
|
||||||
- `--lang`: Base **language** code to use for blank config.
|
|
||||||
- `--model`: Base **model** to copy config from.
|
|
||||||
|
|
||||||
> ```bash
|
> ```bash
|
||||||
> ### with base config {wrap="true"}
|
> ### Example {wrap="true"}
|
||||||
> $ python -m spacy init config config.cfg --base base.cfg
|
> $ python -m spacy init config config.cfg --lang en --pipeline ner,textcat --optimize accuracy
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> ```bash
|
|
||||||
> ### blank language {wrap="true"}
|
|
||||||
> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline]
|
$ python -m spacy init config [output_file] [--lang] [--pipeline]
|
||||||
|
[--optimize] [--cpu]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
|
| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
|
||||||
| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. |
|
| `--lang`, `-l` | option | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. |
|
||||||
| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. |
|
| `--pipeline`, `-p` | option | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. |
|
||||||
| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. |
|
| `--optimize`, `-o` | option | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. |
|
||||||
| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. |
|
| `--cpu`, `-C` | flag | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. |
|
||||||
| **CREATES** | config | Complete and auto-filled config file for training. |
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **CREATES** | file | The config file for training. |
|
||||||
|
|
||||||
|
### init fill-config {#init-fill-config new="3"}
|
||||||
|
|
||||||
|
Auto-fill a partial [`config.cfg` file](/usage/training#config) with **all
|
||||||
|
default values**, e.g. a config generated with the
|
||||||
|
[quickstart widget](/usage/training#quickstart). Config files used for training
|
||||||
|
should always be complete and not contain any hidden defaults or missing values,
|
||||||
|
so this command helps you create your final training config. In order to find
|
||||||
|
the available settings and defaults, all functions referenced in the config will
|
||||||
|
be created, and their signatures are used to find the defaults. If your config
|
||||||
|
contains a problem that can't be resolved automatically, spaCy will show you a
|
||||||
|
validation error with more details.
|
||||||
|
|
||||||
|
> ```bash
|
||||||
|
> ### Example {wrap="true"}
|
||||||
|
> $ python -m spacy init fill-config base.cfg config.cfg
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ python -m spacy init fill-config [base_path] [output_file] [--diff]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Argument | Type | Description |
|
||||||
|
| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `base_path` | positional | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). |
|
||||||
|
| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. |
|
||||||
|
| `--diff`, `-D` | flag | Print a visual diff highlighting the changes. |
|
||||||
|
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||||
|
| **CREATES** | file | Complete and auto-filled config file for training. |
|
||||||
|
|
||||||
### init model {#init-model new="2"}
|
### init model {#init-model new="2"}
|
||||||
|
|
||||||
|
|
|
@ -20,8 +20,9 @@ Config files define the training process and model pipeline and can be passed to
|
||||||
[`spacy train`](/api/cli#train). They use
|
[`spacy train`](/api/cli#train). They use
|
||||||
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
|
||||||
hood. For details on how to use training configs, see the
|
hood. For details on how to use training configs, see the
|
||||||
[usage documentation](/usage/training#config). To get started with a blank
|
[usage documentation](/usage/training#config). To get started with the
|
||||||
config or fill a partial config with all defaults, you can use the
|
recommended settings for your use case, check out the
|
||||||
|
[quickstart widget](/usage/training#quickstart) or run the
|
||||||
[`init config`](/api/cli#init-config) command.
|
[`init config`](/api/cli#init-config) command.
|
||||||
|
|
||||||
> #### What does the @ mean?
|
> #### What does the @ mean?
|
||||||
|
|
|
@ -37,27 +37,37 @@ The recommended way to train your spaCy models is via the
|
||||||
single [`config.cfg`](#config) **configuration file** that includes all settings
|
single [`config.cfg`](#config) **configuration file** that includes all settings
|
||||||
and hyperparameters. You can optionally [overwrite](#config-overrides)
|
and hyperparameters. You can optionally [overwrite](#config-overrides)
|
||||||
settings on the command line, and load in a Python file to register
|
settings on the command line, and load in a Python file to register
|
||||||
[custom functions](#custom-code) and architectures.
|
[custom functions](#custom-code) and architectures. This quickstart widget helps
|
||||||
|
you generate a starter config with the **recommended settings** for your
|
||||||
|
specific use case. It's also available in spaCy as the
|
||||||
|
[`init config`](/api/cli#init-config) command.
|
||||||
|
|
||||||
> #### Instructions
|
> #### Instructions: widget
|
||||||
>
|
>
|
||||||
> 1. Select your requirements and settings.
|
> 1. Select your requirements and settings.
|
||||||
> 2. Use the buttons at the bottom to save the result to your clipboard or a
|
> 2. Use the buttons at the bottom to save the result to your clipboard or a
|
||||||
> file `base_config.cfg`.
|
> file `base_config.cfg`.
|
||||||
> 3. Run [`init config`](/api/cli#init-config) to create a full training config.
|
> 3. Run [`init fill-config`](/api/cli#init-fill-config) to create a full
|
||||||
|
> config.
|
||||||
> 4. Run [`train`](/api/cli#train) with your config and data.
|
> 4. Run [`train`](/api/cli#train) with your config and data.
|
||||||
|
>
|
||||||
|
> #### Instructions: CLI
|
||||||
|
>
|
||||||
|
> 1. Run the [`init config`](/api/cli#init-config) command and specify your
|
||||||
|
> requirements and settings as CLI arguments.
|
||||||
|
> 2. Run [`train`](/api/cli#train) with the exported config and data.
|
||||||
|
|
||||||
import QuickstartTraining from 'widgets/quickstart-training.js'
|
import QuickstartTraining from 'widgets/quickstart-training.js'
|
||||||
|
|
||||||
<QuickstartTraining download="base_config.cfg" />
|
<QuickstartTraining download="base_config.cfg" />
|
||||||
|
|
||||||
After you've saved the starter config to a file `base_config.cfg`, you can use
|
After you've saved the starter config to a file `base_config.cfg`, you can use
|
||||||
the [`init config`](/api/cli#init-config) command to fill in the remaining
|
the [`init fill-config`](/api/cli#init-fill-config) command to fill in the
|
||||||
defaults. Training configs should always be **complete and without hidden
|
remaining defaults. Training configs should always be **complete and without
|
||||||
defaults**, to keep your experiments reproducible.
|
hidden defaults**, to keep your experiments reproducible.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m spacy init config config.cfg --base base_config.cfg
|
$ python -m spacy init fill-config base_config.cfg config.cfg
|
||||||
```
|
```
|
||||||
|
|
||||||
> #### Tip: Debug your data
|
> #### Tip: Debug your data
|
||||||
|
@ -70,10 +80,13 @@ $ python -m spacy init config config.cfg --base base_config.cfg
|
||||||
> $ python -m spacy debug data config.cfg --verbose
|
> $ python -m spacy debug data config.cfg --verbose
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
You can now add your data and run [`train`](/api/cli#train) with your config.
|
Instead of exporting your starter config from the quickstart widget and
|
||||||
See the [`convert`](/api/cli#convert) command for details on how to convert your
|
auto-filling it, you can also use the [`init config`](/api/cli#init-config)
|
||||||
data to spaCy's binary `.spacy` format. You can either include the data paths in
|
command and specify your requirements and settings as CLI arguments. You can now
|
||||||
the `[paths]` section of your config, or pass them in via the command line.
|
add your data and run [`train`](/api/cli#train) with your config. See the
|
||||||
|
[`convert`](/api/cli#convert) command for details on how to convert your data to
|
||||||
|
spaCy's binary `.spacy` format. You can either include the data paths in the
|
||||||
|
`[paths]` section of your config, or pass them in via the command line.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
|
$ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
|
||||||
|
@ -601,7 +614,7 @@ settings in the block will be passed to the function as keyword arguments. Keep
|
||||||
in mind that the config shouldn't have any hidden defaults and all arguments on
|
in mind that the config shouldn't have any hidden defaults and all arguments on
|
||||||
the functions need to be represented in the config. If your function defines
|
the functions need to be represented in the config. If your function defines
|
||||||
**default argument values**, spaCy is able to auto-fill your config when you run
|
**default argument values**, spaCy is able to auto-fill your config when you run
|
||||||
[`init config`](/api/cli#init-config).
|
[`init fill-config`](/api/cli#init-fill-config).
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
### config.cfg (excerpt)
|
### config.cfg (excerpt)
|
||||||
|
|
|
@ -163,8 +163,9 @@ resolved, the function is created and passed into the model as an argument.
|
||||||
Remember that the `config.cfg` used for training should contain **no missing
|
Remember that the `config.cfg` used for training should contain **no missing
|
||||||
values** and requires all settings to be defined. You don't want any hidden
|
values** and requires all settings to be defined. You don't want any hidden
|
||||||
defaults creeping in and changing your results! spaCy will tell you if settings
|
defaults creeping in and changing your results! spaCy will tell you if settings
|
||||||
are missing, and you can run [`spacy init config`](/api/cli#init-config) with to
|
are missing, and you can run
|
||||||
automatically fill in all defaults.
|
[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in
|
||||||
|
all defaults.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
|
|
@ -152,7 +152,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
|
||||||
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
|
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
|
||||||
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
|
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
|
||||||
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
|
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
|
||||||
| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file for a blank language or auto-filling a partial config. |
|
| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file with the recommended settings. |
|
||||||
|
| [`init fill-config`](/api/cli#init-fill-config) | CLI command for auto-filling a partial config with all defaults and missing values. |
|
||||||
| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. |
|
| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. |
|
||||||
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
|
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
# Forked from: https://github.com/jonbretman/jinja-to-js
|
# Forked from: https://github.com/jonbretman/jinja-to-js
|
||||||
|
# With additional functionality: in/not in, replace, pprint, round, + for lists,
|
||||||
|
# rendering empty dicts
|
||||||
# This script is mostly used to generate the JavaScript function for the
|
# This script is mostly used to generate the JavaScript function for the
|
||||||
# training quickstart widget.
|
# training quickstart widget.
|
||||||
import contextlib
|
import contextlib
|
||||||
|
@ -315,7 +317,7 @@ class JinjaToJS(object):
|
||||||
if callable(handler):
|
if callable(handler):
|
||||||
handler(node, **kwargs)
|
handler(node, **kwargs)
|
||||||
else:
|
else:
|
||||||
raise Exception("Unknown node %s" % node)
|
raise Exception(f"Unknown node {node} ({node_name})")
|
||||||
|
|
||||||
def _process_extends(self, node, **kwargs):
|
def _process_extends(self, node, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
@ -431,6 +433,13 @@ class JinjaToJS(object):
|
||||||
|
|
||||||
self.output.write(node.name)
|
self.output.write(node.name)
|
||||||
|
|
||||||
|
def _process_dict(self, node, **kwargs):
|
||||||
|
with self._interpolation():
|
||||||
|
with self._python_bool_wrapper(**kwargs):
|
||||||
|
if node.items:
|
||||||
|
raise ValueError(f"Can't process non-empty dict in epxression: {node}")
|
||||||
|
self.output.write("{}")
|
||||||
|
|
||||||
def _process_getattr(self, node, **kwargs):
|
def _process_getattr(self, node, **kwargs):
|
||||||
"""
|
"""
|
||||||
Processes a `GetAttr` node. e.g. {{ foo.bar }}
|
Processes a `GetAttr` node. e.g. {{ foo.bar }}
|
||||||
|
@ -697,6 +706,27 @@ class JinjaToJS(object):
|
||||||
self._process_node(node.node, **new_kwargs)
|
self._process_node(node.node, **new_kwargs)
|
||||||
self.output.write(")")
|
self.output.write(")")
|
||||||
|
|
||||||
|
def _process_filter_replace(self, node, **kwargs):
|
||||||
|
# We're getting a quoted string from Python/Jinja as the pattern to
|
||||||
|
# replace, but to replace all occurrences in JS, we typically need a
|
||||||
|
# regex, which would be annoying to convert. So we're using split/join
|
||||||
|
# instead here.
|
||||||
|
with self._interpolation():
|
||||||
|
with self._python_bool_wrapper(**kwargs) as new_kwargs:
|
||||||
|
self._process_node(node.node, **new_kwargs)
|
||||||
|
self.output.write(".split(")
|
||||||
|
self._process_node(node.args[0], **new_kwargs)
|
||||||
|
self.output.write(").join(")
|
||||||
|
self._process_node(node.args[1], **new_kwargs)
|
||||||
|
self.output.write(")")
|
||||||
|
|
||||||
|
def _process_filter_pprint(self, node, **kwargs):
|
||||||
|
with self._interpolation():
|
||||||
|
with self._python_bool_wrapper(**kwargs) as new_kwargs:
|
||||||
|
self.output.write("JSON.stringify(")
|
||||||
|
self._process_node(node.node, **new_kwargs)
|
||||||
|
self.output.write(")")
|
||||||
|
|
||||||
def _process_filter_attr(self, node, **kwargs):
|
def _process_filter_attr(self, node, **kwargs):
|
||||||
with self._interpolation():
|
with self._interpolation():
|
||||||
with self._python_bool_wrapper(**kwargs) as new_kwargs:
|
with self._python_bool_wrapper(**kwargs) as new_kwargs:
|
||||||
|
@ -746,7 +776,10 @@ class JinjaToJS(object):
|
||||||
with self._python_bool_wrapper(**kwargs) as new_kwargs:
|
with self._python_bool_wrapper(**kwargs) as new_kwargs:
|
||||||
self.output.write("Math.round((")
|
self.output.write("Math.round((")
|
||||||
self._process_node(node.node, **new_kwargs)
|
self._process_node(node.node, **new_kwargs)
|
||||||
self.output.write("+ Number.EPSILON) * 100) / 100")
|
self.output.write("+ Number.EPSILON) * 10**")
|
||||||
|
self._process_node(node.args[0], **new_kwargs)
|
||||||
|
self.output.write(") / 10**")
|
||||||
|
self._process_node(node.args[0], **new_kwargs)
|
||||||
|
|
||||||
def _process_filter_last(self, node, **kwargs):
|
def _process_filter_last(self, node, **kwargs):
|
||||||
with self._interpolation():
|
with self._interpolation():
|
||||||
|
@ -867,8 +900,10 @@ class JinjaToJS(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
with option(kwargs, use_python_bool_wrapper=False):
|
with option(kwargs, use_python_bool_wrapper=False):
|
||||||
if operand.op == "in":
|
if operand.op == "in" or operand.op == "notin":
|
||||||
# Special case for "in" operator
|
# Special case for "in" operator
|
||||||
|
if operand.op == "notin":
|
||||||
|
self.output.write("!")
|
||||||
self._process_node(operand.expr, **kwargs)
|
self._process_node(operand.expr, **kwargs)
|
||||||
self.output.write(".includes(")
|
self.output.write(".includes(")
|
||||||
self._process_node(node.expr, **kwargs)
|
self._process_node(node.expr, **kwargs)
|
||||||
|
@ -1027,6 +1062,17 @@ class JinjaToJS(object):
|
||||||
self.output.write(")")
|
self.output.write(")")
|
||||||
|
|
||||||
def _process_add(self, node, **kwargs):
|
def _process_add(self, node, **kwargs):
|
||||||
|
# Handle + operator for lists, which behaves differently in JS. Currently
|
||||||
|
# only works if we have an explicit list node on either side (in which
|
||||||
|
# case we assume both are lists).
|
||||||
|
if isinstance(node.left, nodes.List) or isinstance(node.right, nodes.List):
|
||||||
|
with self._interpolation():
|
||||||
|
with self._python_bool_wrapper(**kwargs) as new_kwargs:
|
||||||
|
self._process_node(node.left, **new_kwargs)
|
||||||
|
self.output.write(".concat(")
|
||||||
|
self._process_node(node.right, **new_kwargs)
|
||||||
|
self.output.write(")")
|
||||||
|
else:
|
||||||
self._process_math(node, math_operator=" + ", **kwargs)
|
self._process_math(node, math_operator=" + ", **kwargs)
|
||||||
|
|
||||||
def _process_sub(self, node, **kwargs):
|
def _process_sub(self, node, **kwargs):
|
||||||
|
@ -1190,16 +1236,22 @@ def main(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"),
|
template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"),
|
||||||
output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"),
|
output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"),
|
||||||
|
data_path: Path = typer.Option(None, "--data", help="Optional JSON file with additional data to be included as DATA")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Convert a jinja2 template to a JavaScript module."""
|
"""Convert a jinja2 template to a JavaScript module."""
|
||||||
compiler = JinjaToJS(
|
data = "{}"
|
||||||
template_path.parent, template_path.parts[-1], js_module_format="es6"
|
if data_path is not None:
|
||||||
)
|
with data_path.open("r", encoding="utf8") as f:
|
||||||
|
data = json.dumps(json.loads(f.read())) # dump and load for compactness
|
||||||
|
tpl_file = template_path.parts[-1]
|
||||||
|
compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6")
|
||||||
|
header = f"// This file was auto-generated by {__file__} based on {tpl_file}"
|
||||||
|
data_str = f"export const DATA = {data}"
|
||||||
result = compiler.get_output()
|
result = compiler.get_output()
|
||||||
if output is not None:
|
if output is not None:
|
||||||
with output.open("w") as f:
|
with output.open("w") as f:
|
||||||
f.write(result)
|
f.write(f"{header}\n{result}\n{data_str}")
|
||||||
print(f"Updated {output.parts[-1]}")
|
print(f"Updated {output.parts[-1]}")
|
||||||
else:
|
else:
|
||||||
print(result)
|
print(result)
|
||||||
|
|
|
@ -1,107 +0,0 @@
|
||||||
{# Template for "CPU" configs. The transformer will use a different template. #}
|
|
||||||
# This is an auto-generated partial config for training a model.
|
|
||||||
# To use it for training, auto-fill it with all default values.
|
|
||||||
# python -m spacy init config config.cfg --base base_config.cfg
|
|
||||||
[paths]
|
|
||||||
train = ""
|
|
||||||
dev = ""
|
|
||||||
|
|
||||||
[nlp]
|
|
||||||
lang = "{{ lang }}"
|
|
||||||
pipeline = {{ pipeline|safe }}
|
|
||||||
vectors = {{ ('"en_vectors_web_lg"' if optimize == "accuracy" else false)|safe }}
|
|
||||||
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
|
||||||
|
|
||||||
[components]
|
|
||||||
|
|
||||||
[components.tok2vec]
|
|
||||||
factory = "tok2vec"
|
|
||||||
|
|
||||||
[components.tok2vec.model]
|
|
||||||
@architectures = "spacy.Tok2Vec.v1"
|
|
||||||
|
|
||||||
[components.tok2vec.model.embed]
|
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
|
||||||
also_embed_subwords = {{ true if has_letters else false }}
|
|
||||||
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
|
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
|
||||||
width = {{ 96 if optimize == "efficiency" else 256 }}
|
|
||||||
depth = {{ 4 if optimize == "efficiency" else 8 }}
|
|
||||||
window_size = 1
|
|
||||||
maxout_pieces = 3
|
|
||||||
|
|
||||||
{% if "tagger" in components %}
|
|
||||||
[components.tagger]
|
|
||||||
factory = "tagger"
|
|
||||||
|
|
||||||
[components.tagger.model]
|
|
||||||
@architectures = "spacy.Tagger.v1"
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2VecListener.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "parser" in components -%}
|
|
||||||
[components.parser]
|
|
||||||
factory = "parser"
|
|
||||||
|
|
||||||
[components.parser.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 8
|
|
||||||
hidden_width = 128
|
|
||||||
maxout_pieces = 3
|
|
||||||
use_upper = true
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.parser.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2VecListener.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "ner" in components -%}
|
|
||||||
[components.ner]
|
|
||||||
factory = "ner"
|
|
||||||
|
|
||||||
[components.ner.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 6
|
|
||||||
hidden_width = 64
|
|
||||||
maxout_pieces = 2
|
|
||||||
use_upper = true
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec]
|
|
||||||
@architectures = "spacy.Tok2VecListener.v1"
|
|
||||||
width = ${components.tok2vec.model.encode:width}
|
|
||||||
{% endif -%}
|
|
||||||
|
|
||||||
[training]
|
|
||||||
|
|
||||||
[training.train_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:train}
|
|
||||||
|
|
||||||
[training.dev_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:dev}
|
|
||||||
|
|
||||||
[training.score_weights]
|
|
||||||
{%- if "tagger" in components %}
|
|
||||||
tag_acc = {{ (1.0 / components|length)|round() }}
|
|
||||||
{%- endif -%}
|
|
||||||
{%- if "parser" in components %}
|
|
||||||
dep_uas = 0.0
|
|
||||||
dep_las = {{ (1.0 / components|length)|round() }}
|
|
||||||
sents_f = 0.0
|
|
||||||
{%- endif %}
|
|
||||||
{%- if "ner" in components %}
|
|
||||||
ents_f = {{ (1.0 / components|length)|round() }}
|
|
||||||
ents_p = 0.0
|
|
||||||
ents_r = 0.0
|
|
||||||
{%- endif -%}
|
|
|
@ -1,139 +0,0 @@
|
||||||
{# Template for "CPU" configs. The transformer will use a different template. #}
|
|
||||||
# This is an auto-generated partial config for training a model.
|
|
||||||
# To use it for training, auto-fill it with all default values.
|
|
||||||
# python -m spacy init config config.cfg --base base_config.cfg
|
|
||||||
[paths]
|
|
||||||
train = ""
|
|
||||||
dev = ""
|
|
||||||
|
|
||||||
[nlp]
|
|
||||||
lang = "{{ lang }}"
|
|
||||||
pipeline = {{ pipeline|safe }}
|
|
||||||
vectors = null
|
|
||||||
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
|
|
||||||
|
|
||||||
[components]
|
|
||||||
|
|
||||||
[components.transformer]
|
|
||||||
factory = "transformer"
|
|
||||||
|
|
||||||
[components.transformer.model]
|
|
||||||
@architectures = "spacy-transformers.TransformerModel.v1"
|
|
||||||
{#- name = {{ transformer_info["name"] }} #}
|
|
||||||
name = "roberta-base"
|
|
||||||
tokenizer_config = {"use_fast": true}
|
|
||||||
|
|
||||||
[components.transformer.model.get_spans]
|
|
||||||
@span_getters = "strided_spans.v1"
|
|
||||||
window = 128
|
|
||||||
stride = 96
|
|
||||||
|
|
||||||
{% if "tagger" in components %}
|
|
||||||
[components.tagger]
|
|
||||||
factory = "tagger"
|
|
||||||
|
|
||||||
[components.tagger.model]
|
|
||||||
@architectures = "spacy.Tagger.v1"
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
|
||||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
||||||
grad_factor = 1.0
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec.pooling]
|
|
||||||
@layers = "reduce_mean.v1"
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "parser" in components -%}
|
|
||||||
[components.parser]
|
|
||||||
factory = "parser"
|
|
||||||
|
|
||||||
[components.parser.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 8
|
|
||||||
hidden_width = 128
|
|
||||||
maxout_pieces = 3
|
|
||||||
use_upper = false
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.parser.model.tok2vec]
|
|
||||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
||||||
grad_factor = 1.0
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec.pooling]
|
|
||||||
@layers = "reduce_mean.v1"
|
|
||||||
{%- endif %}
|
|
||||||
|
|
||||||
{% if "ner" in components -%}
|
|
||||||
[components.ner]
|
|
||||||
factory = "ner"
|
|
||||||
|
|
||||||
[components.ner.model]
|
|
||||||
@architectures = "spacy.TransitionBasedParser.v1"
|
|
||||||
nr_feature_tokens = 3
|
|
||||||
hidden_width = 64
|
|
||||||
maxout_pieces = 2
|
|
||||||
use_upper = false
|
|
||||||
nO = null
|
|
||||||
|
|
||||||
[components.ner.model.tok2vec]
|
|
||||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
|
||||||
grad_factor = 1.0
|
|
||||||
|
|
||||||
[components.parser.model.tok2vec.pooling]
|
|
||||||
@layers = "reduce_mean.v1"
|
|
||||||
{% endif -%}
|
|
||||||
|
|
||||||
[training]
|
|
||||||
{#- accumulate_gradient = {{ transformer_info["size_factor"] }} #}
|
|
||||||
accumulate_gradient = 3
|
|
||||||
|
|
||||||
[training.optimizer]
|
|
||||||
@optimizers = "Adam.v1"
|
|
||||||
beta1 = 0.9
|
|
||||||
beta2 = 0.999
|
|
||||||
L2_is_weight_decay = true
|
|
||||||
L2 = 0.01
|
|
||||||
grad_clip = 1.0
|
|
||||||
use_averages = false
|
|
||||||
eps = 1e-8
|
|
||||||
|
|
||||||
[training.optimizer.learn_rate]
|
|
||||||
@schedules = "warmup_linear.v1"
|
|
||||||
warmup_steps = 250
|
|
||||||
total_steps = 20000
|
|
||||||
initial_rate = 5e-5
|
|
||||||
|
|
||||||
[training.train_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:train}
|
|
||||||
gold_preproc = false
|
|
||||||
max_length = 500
|
|
||||||
limit = 0
|
|
||||||
|
|
||||||
[training.dev_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths:dev}
|
|
||||||
gold_preproc = false
|
|
||||||
max_length = 0
|
|
||||||
limit = 0
|
|
||||||
|
|
||||||
[training.batcher]
|
|
||||||
@batchers = "batch_by_padded.v1"
|
|
||||||
discard_oversize = true
|
|
||||||
batch_size = 2000
|
|
||||||
|
|
||||||
[training.score_weights]
|
|
||||||
{%- if "tagger" in components %}
|
|
||||||
tag_acc = {{ (1.0 / components|length)|round(2) }}
|
|
||||||
{%- endif -%}
|
|
||||||
{%- if "parser" in components %}
|
|
||||||
dep_uas = 0.0
|
|
||||||
dep_las = {{ (1.0 / components|length)|round(2) }}
|
|
||||||
sents_f = 0.0
|
|
||||||
{%- endif %}
|
|
||||||
{%- if "ner" in components %}
|
|
||||||
ents_f = {{ (1.0 / components|length)|round(2) }}
|
|
||||||
ents_p = 0.0
|
|
||||||
ents_r = 0.0
|
|
||||||
{%- endif -%}
|
|
|
@ -1 +1 @@
|
||||||
python jinja_to_js.py quickstart_training_cpu.jinja ../src/widgets/quickstart-training-generator.js
|
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json
|
||||||
|
|
|
@ -125,9 +125,9 @@
|
||||||
display: block
|
display: block
|
||||||
|
|
||||||
.small
|
.small
|
||||||
font-size: var(--font-size-sm)
|
font-size: var(--font-size-code)
|
||||||
line-height: 1.65
|
line-height: 1.65
|
||||||
white-space: pre
|
white-space: pre-wrap
|
||||||
max-height: 400px
|
max-height: 400px
|
||||||
overflow-y: auto
|
overflow-y: auto
|
||||||
|
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -2,14 +2,17 @@ import React, { useState } from 'react'
|
||||||
import { StaticQuery, graphql } from 'gatsby'
|
import { StaticQuery, graphql } from 'gatsby'
|
||||||
import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
|
import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
|
||||||
|
|
||||||
import { Quickstart, QS } from '../components/quickstart'
|
import { Quickstart } from '../components/quickstart'
|
||||||
import generator from './quickstart-training-generator'
|
import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
|
||||||
import { isString, htmlToReact } from '../components/util'
|
import { isString, htmlToReact } from '../components/util'
|
||||||
|
|
||||||
const DEFAULT_LANG = 'en'
|
const DEFAULT_LANG = 'en'
|
||||||
const DEFAULT_HARDWARE = 'gpu'
|
const DEFAULT_HARDWARE = 'gpu'
|
||||||
const DEFAULT_OPT = 'efficiency'
|
const DEFAULT_OPT = 'efficiency'
|
||||||
const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
|
const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
|
||||||
|
const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train'
|
||||||
|
# you can run spacy init fill-config to auto-fill all default settings:
|
||||||
|
# python -m spacy init fill-config ./base_config.cfg ./config.cfg`
|
||||||
|
|
||||||
const DATA = [
|
const DATA = [
|
||||||
{
|
{
|
||||||
|
@ -61,14 +64,17 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
|
||||||
hardware: setHardware,
|
hardware: setHardware,
|
||||||
optimize: setOptimize,
|
optimize: setOptimize,
|
||||||
}
|
}
|
||||||
|
const reco = GENERATOR_DATA[lang] || {}
|
||||||
const content = generator({
|
const content = generator({
|
||||||
lang,
|
lang,
|
||||||
pipeline: stringify(components),
|
|
||||||
components,
|
components,
|
||||||
optimize,
|
optimize,
|
||||||
hardware,
|
hardware,
|
||||||
|
transformer_data: reco.transformer,
|
||||||
|
word_vectors: reco.word_vectors,
|
||||||
})
|
})
|
||||||
const rawContent = content.trim().replace(/\n\n\n+/g, '\n\n')
|
const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
|
||||||
|
const rawContent = `${COMMENT}\n${rawStr}`
|
||||||
const displayContent = highlightCode('ini', rawContent)
|
const displayContent = highlightCode('ini', rawContent)
|
||||||
.split('\n')
|
.split('\n')
|
||||||
.map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
|
.map(line => (line.startsWith('#') ? `<span class="token comment">${line}</span>` : line))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user