diff --git a/spacy/__init__.py b/spacy/__init__.py
index da2b23a20..73e828936 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -8,6 +8,7 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa
# These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu # noqa: F401
+from thinc.api import Config
from . import pipeline # noqa: F401
from .cli.info import info # noqa: F401
@@ -26,17 +27,17 @@ if sys.maxunicode == 65535:
def load(
name: Union[str, Path],
disable: Iterable[str] = tuple(),
- component_cfg: Dict[str, Dict[str, Any]] = util.SimpleFrozenDict(),
+ config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
name (str): Package name or model path.
disable (Iterable[str]): Names of pipeline components to disable.
- component_cfg (Dict[str, dict]): Config overrides for pipeline components,
- keyed by component names.
+ config (Dict[str, Any] / Config): Config overrides as nested dict or dict
+ keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
- return util.load_model(name, disable=disable, component_cfg=component_cfg)
+ return util.load_model(name, disable=disable, config=config)
def blank(name: str, **overrides) -> Language:
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 1fd9fd813..6c8c85e30 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -49,11 +49,9 @@ def debug_config_cli(
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error(config_path):
- config = Config().from_disk(config_path)
+ config = Config().from_disk(config_path, overrides=overrides)
try:
- nlp, _ = util.load_model_from_config(
- config, overrides=overrides, auto_fill=auto_fill
- )
+ nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
except ValueError as e:
msg.fail(str(e), exits=1)
if auto_fill:
@@ -136,8 +134,8 @@ def debug_data(
if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1)
with show_validation_error(config_path):
- cfg = Config().from_disk(config_path)
- nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
+ cfg = Config().from_disk(config_path, overrides=config_overrides)
+ nlp, config = util.load_model_from_config(cfg)
# Use original config here, not resolved version
sourced_components = get_sourced_components(cfg)
frozen_components = config["training"]["frozen_components"]
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 480c6b2c4..7c6c76a34 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -49,9 +49,9 @@ def debug_model_cli(
}
config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path):
- cfg = Config().from_disk(config_path)
+ cfg = Config().from_disk(config_path, overrides=config_overrides)
try:
- nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
+ nlp, config = util.load_model_from_config(cfg)
except ValueError as e:
msg.fail(str(e), exits=1)
seed = config.get("training", {}).get("seed", None)
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 23de5f452..7202ccacf 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -88,8 +88,8 @@ def pretrain(
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
- config = Config().from_disk(config_path)
- nlp, config = util.load_model_from_config(config, overrides=config_overrides)
+ config = Config().from_disk(config_path, overrides=config_overrides)
+ nlp, config = util.load_model_from_config(config)
# TODO: validate that [pretraining] block exists
if not output_dir.exists():
output_dir.mkdir()
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 7e9ec9ec9..c5c6e7252 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -75,13 +75,13 @@ def train(
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
with show_validation_error(config_path):
- config = Config().from_disk(config_path)
+ config = Config().from_disk(config_path, overrides=config_overrides)
if config.get("training", {}).get("seed") is not None:
fix_random_seed(config["training"]["seed"])
# Use original config here before it's resolved to functions
sourced_components = get_sourced_components(config)
with show_validation_error(config_path):
- nlp, config = util.load_model_from_config(config, overrides=config_overrides)
+ nlp, config = util.load_model_from_config(config)
if config["training"]["vectors"] is not None:
util.load_vectors_into_model(nlp, config["training"]["vectors"])
verify_config(nlp)
@@ -144,7 +144,7 @@ def train(
max_steps=T_cfg["max_steps"],
eval_frequency=T_cfg["eval_frequency"],
raw_text=None,
- exclude=frozen_components
+ exclude=frozen_components,
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row = setup_printer(T_cfg, nlp)
diff --git a/spacy/language.py b/spacy/language.py
index 4b44b9820..e9d7e9eb6 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -558,7 +558,6 @@ class Language:
name: Optional[str] = None,
*,
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
- overrides: Optional[Dict[str, Any]] = SimpleFrozenDict(),
validate: bool = True,
) -> Callable[[Doc], Doc]:
"""Create a pipeline component. Mostly used internally. To create and
@@ -569,8 +568,6 @@ class Language:
Defaults to factory name if not set.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
- overrides (Optional[Dict[str, Any]]): Config overrides, typically
- passed in via the CLI.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -613,7 +610,7 @@ class Language:
# registered functions twice
# TODO: customize validation to make it more readable / relate it to
# pipeline component and why it failed, explain default config
- resolved, filled = registry.resolve(cfg, validate=validate, overrides=overrides)
+ resolved, filled = registry.resolve(cfg, validate=validate)
filled = filled[factory_name]
filled["factory"] = factory_name
filled.pop("@factories", None)
@@ -657,7 +654,6 @@ class Language:
last: Optional[bool] = None,
source: Optional["Language"] = None,
config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
- overrides: Optional[Dict[str, Any]] = SimpleFrozenDict(),
validate: bool = True,
) -> Callable[[Doc], Doc]:
"""Add a component to the processing pipeline. Valid components are
@@ -679,8 +675,6 @@ class Language:
component from.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
- overrides (Optional[Dict[str, Any]]): Config overrides, typically
- passed in via the CLI.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -710,11 +704,7 @@ class Language:
lang_code=self.lang,
)
pipe_component = self.create_pipe(
- factory_name,
- name=name,
- config=config,
- overrides=overrides,
- validate=validate,
+ factory_name, name=name, config=config, validate=validate,
)
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
@@ -1416,7 +1406,6 @@ class Language:
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = tuple(),
- overrides: Dict[str, Any] = {},
auto_fill: bool = True,
validate: bool = True,
) -> "Language":
@@ -1456,9 +1445,8 @@ class Language:
config = util.copy_config(config)
orig_pipeline = config.pop("components", {})
config["components"] = {}
- non_pipe_overrides, pipe_overrides = _get_config_overrides(overrides)
resolved, filled = registry.resolve(
- config, validate=validate, schema=ConfigSchema, overrides=non_pipe_overrides
+ config, validate=validate, schema=ConfigSchema
)
filled["components"] = orig_pipeline
config["components"] = orig_pipeline
@@ -1507,11 +1495,7 @@ class Language:
# The pipe name (key in the config) here is the unique name
# of the component, not necessarily the factory
nlp.add_pipe(
- factory,
- name=pipe_name,
- config=pipe_cfg,
- overrides=pipe_overrides,
- validate=validate,
+ factory, name=pipe_name, config=pipe_cfg, validate=validate,
)
else:
model = pipe_cfg["source"]
@@ -1696,15 +1680,6 @@ class FactoryMeta:
default_score_weights: Optional[Dict[str, float]] = None # noqa: E704
-def _get_config_overrides(
- items: Dict[str, Any], prefix: str = "components"
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
- prefix = f"{prefix}."
- non_pipe = {k: v for k, v in items.items() if not k.startswith(prefix)}
- pipe = {k.replace(prefix, ""): v for k, v in items.items() if k.startswith(prefix)}
- return non_pipe, pipe
-
-
def _fix_pretrained_vectors_name(nlp: Language) -> None:
# TODO: Replace this once we handle vectors consistently as static
# data
diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py
index 095ca8495..cc7a9bd38 100644
--- a/spacy/tests/regression/test_issue5137.py
+++ b/spacy/tests/regression/test_issue5137.py
@@ -27,6 +27,6 @@ def test_issue5137():
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
- overrides = {"my_component": {"categories": "my_categories"}}
- nlp2 = spacy.load(tmpdir, component_cfg=overrides)
+ overrides = {"components": {"my_component": {"categories": "my_categories"}}}
+ nlp2 = spacy.load(tmpdir, config=overrides)
assert nlp2.get_pipe("my_component").categories == "my_categories"
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 9b4d841b2..0d3c90c92 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -2,6 +2,7 @@ import pytest
from thinc.config import Config, ConfigValidationError
import spacy
from spacy.lang.en import English
+from spacy.lang.de import German
from spacy.language import Language
from spacy.util import registry, deep_merge_configs, load_model_from_config
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
@@ -282,3 +283,33 @@ def test_serialize_config_missing_pipes():
assert "tok2vec" not in config["components"]
with pytest.raises(ValueError):
load_model_from_config(config, auto_fill=True)
+
+
+def test_config_overrides():
+ overrides_nested = {"nlp": {"lang": "de", "pipeline": ["tagger"]}}
+ overrides_dot = {"nlp.lang": "de", "nlp.pipeline": ["tagger"]}
+ # load_model from config with overrides passed directly to Config
+ config = Config().from_str(nlp_config_string, overrides=overrides_dot)
+ nlp, _ = load_model_from_config(config, auto_fill=True)
+ assert isinstance(nlp, German)
+ assert nlp.pipe_names == ["tagger"]
+ # Serialized roundtrip with config passed in
+ base_config = Config().from_str(nlp_config_string)
+ base_nlp, _ = load_model_from_config(base_config, auto_fill=True)
+ assert isinstance(base_nlp, English)
+ assert base_nlp.pipe_names == ["tok2vec", "tagger"]
+ with make_tempdir() as d:
+ base_nlp.to_disk(d)
+ nlp = spacy.load(d, config=overrides_nested)
+ assert isinstance(nlp, German)
+ assert nlp.pipe_names == ["tagger"]
+ with make_tempdir() as d:
+ base_nlp.to_disk(d)
+ nlp = spacy.load(d, config=overrides_dot)
+ assert isinstance(nlp, German)
+ assert nlp.pipe_names == ["tagger"]
+ with make_tempdir() as d:
+ base_nlp.to_disk(d)
+ nlp = spacy.load(d)
+ assert isinstance(nlp, English)
+ assert nlp.pipe_names == ["tok2vec", "tagger"]
diff --git a/spacy/util.py b/spacy/util.py
index b5140d420..52073097e 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -210,7 +210,7 @@ def load_model(
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
- component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
+ config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a package or data path.
@@ -218,11 +218,11 @@ def load_model(
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
- component_cfg (Dict[str, dict]): Config overrides for pipeline components,
- keyed by component names.
+ config (Dict[str, Any] / Config): Config overrides as nested dict or dict
+ keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
- kwargs = {"vocab": vocab, "disable": disable, "component_cfg": component_cfg}
+ kwargs = {"vocab": vocab, "disable": disable, "config": config}
if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model
return get_lang_class(name.replace("blank:", ""))()
@@ -240,11 +240,11 @@ def load_model_from_package(
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
- component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
+ config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package."""
cls = importlib.import_module(name)
- return cls.load(vocab=vocab, disable=disable, component_cfg=component_cfg)
+ return cls.load(vocab=vocab, disable=disable, config=config)
def load_model_from_path(
@@ -253,7 +253,7 @@ def load_model_from_path(
meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
- component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
+ config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path."""
@@ -264,12 +264,8 @@ def load_model_from_path(
config_path = model_path / "config.cfg"
if not config_path.exists() or not config_path.is_file():
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
- config = Config().from_disk(config_path)
- override_cfg = {"components": {p: dict_to_dot(c) for p, c in component_cfg.items()}}
- overrides = dict_to_dot(override_cfg)
- nlp, _ = load_model_from_config(
- config, vocab=vocab, disable=disable, overrides=overrides
- )
+ config = Config().from_disk(config_path, overrides=dict_to_dot(config))
+ nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
return nlp.from_disk(model_path, exclude=disable)
@@ -278,7 +274,6 @@ def load_model_from_config(
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
- overrides: Dict[str, Any] = {},
auto_fill: bool = False,
validate: bool = True,
) -> Tuple["Language", Config]:
@@ -294,12 +289,7 @@ def load_model_from_config(
# registry, including custom subclasses provided via entry points
lang_cls = get_lang_class(nlp_config["lang"])
nlp = lang_cls.from_config(
- config,
- vocab=vocab,
- disable=disable,
- overrides=overrides,
- auto_fill=auto_fill,
- validate=validate,
+ config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
)
return nlp, nlp.resolved
@@ -309,14 +299,10 @@ def load_model_from_init_py(
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
- component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
+ config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Helper function to use in the `load()` method of a model package's
__init__.py.
-
- init_file (str): Path to model's __init__.py, i.e. `__file__`.
- **overrides: Specific overrides, like pipeline components to disable.
- RETURNS (Language): `Language` class with loaded model.
"""
model_path = Path(init_file).parent
meta = get_model_meta(model_path)
@@ -325,7 +311,7 @@ def load_model_from_init_py(
if not model_path.exists():
raise IOError(Errors.E052.format(path=data_path))
return load_model_from_path(
- data_path, vocab=vocab, meta=meta, disable=disable, component_cfg=component_cfg
+ data_path, vocab=vocab, meta=meta, disable=disable, config=config
)
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 5b3326739..71b53f844 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -32,13 +32,13 @@ loaded in via [`Language.from_disk`](/api/language#from_disk).
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
> ```
-| Name | Type | Description |
-| ------------------------------------------ | ----------------- | --------------------------------------------------------------------------------- |
-| `name` | str / `Path` | Model to load, i.e. package name or path. |
-| _keyword-only_ | | |
-| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
-| `component_cfg` 3 | `Dict[str, dict]` | Optional config overrides for pipeline components, keyed by component names. |
-| **RETURNS** | `Language` | A `Language` object with the loaded model. |
+| Name | Type | Description |
+| ----------------------------------- | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `name` | str / `Path` | Model to load, i.e. package name or path. |
+| _keyword-only_ | | |
+| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
+| `config` 3 | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. |
+| **RETURNS** | `Language` | A `Language` object with the loaded model. |
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
and pipeline components from a model's `meta.json`, initializes the `Language`