diff --git a/pyproject.toml b/pyproject.toml
index a28cb1ffc..77b78a067 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a25,<8.0.0a30",
+    "thinc>=8.0.0a26,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "smart_open>=2.0.0,<3.0.0"
diff --git a/requirements.txt b/requirements.txt
index 488559550..1f6a28d59 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a25,<8.0.0a30
+thinc>=8.0.0a26,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index d86c71630..1d60fd710 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a25,<8.0.0a30
+    thinc>=8.0.0a26,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a25,<8.0.0a30
+    thinc>=8.0.0a26,<8.0.0a30
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index cb368cb94..3df224131 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -5,7 +5,6 @@ import sys
 import srsly
 from wasabi import Printer, MESSAGES, msg, diff_strings
 import typer
-from thinc.api import Config

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, get_sourced_components
@@ -49,7 +48,7 @@ def debug_config_cli(
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     with show_validation_error(config_path):
-        config = Config().from_disk(config_path, overrides=overrides, interpolate=False)
+        config = util.load_config(config_path, overrides=overrides)
         try:
             nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
         except ValueError as e:
@@ -134,9 +133,7 @@ def debug_data(
     if not config_path.exists():
         msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error(config_path):
-        cfg = Config().from_disk(
-            config_path, overrides=config_overrides, interpolate=False
-        )
+        cfg = util.load_config(config_path, overrides=config_overrides)
         nlp, config = util.load_model_from_config(cfg)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 6b7bad484..0143d2dbe 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -49,10 +49,9 @@ def debug_model_cli(
     }
     config_overrides = parse_config_overrides(ctx.args)
     with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=config_overrides)
         try:
-            nlp, config = util.load_model_from_config_path(
-                config_path, overrides=config_overrides
-            )
+            nlp, config = util.load_model_from_config(config)
         except ValueError as e:
             msg.fail(str(e), exits=1)
     seed = config["pretraining"]["seed"]
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 5cb425c7c..cc4c980be 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -1,12 +1,11 @@
 from typing import Optional, List
 from enum import Enum
 from pathlib import Path
-from thinc.api import Config
 from wasabi import Printer
 import srsly
 import re

-from ..util import load_model_from_config, get_lang_class
+from .. import util
 from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND


@@ -50,7 +49,7 @@ def init_config(
         from jinja2 import Template
     except ImportError:
         msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
-    lang_defaults = get_lang_class(lang).Defaults
+    lang_defaults = util.get_lang_class(lang).Defaults
     has_letters = lang_defaults.writing_system.get("has_letters", True)
     has_transformer = False  # TODO: check this somehow
     if has_transformer:
@@ -80,9 +79,9 @@ def init_config(
         msg.text(f"- {label}: {value}")
     with show_validation_error(hint_init=False):
         with msg.loading("Auto-filling config..."):
-            config = Config().from_str(base_template, interpolate=False)
+            config = util.load_config_from_str(base_template)
             try:
-                nlp, _ = load_model_from_config(config, auto_fill=True)
+                nlp, _ = util.load_model_from_config(config, auto_fill=True)
             except ValueError as e:
                 msg.fail(str(e), exits=1)
     msg.good("Auto-filled config with all values")
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 7c262f9a2..82950f402 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -88,9 +88,8 @@ def pretrain(
         msg.info("Using CPU")
     msg.info(f"Loading config from: {config_path}")
     with show_validation_error(config_path):
-        nlp, config = util.load_model_from_config_path(
-            config_path, overrides=config_overrides
-        )
+        config = util.load_config(config_path, overrides=config_overrides)
+        nlp, config = util.load_model_from_config(config)
     # TODO: validate that [pretraining] block exists
     if not output_dir.exists():
         output_dir.mkdir()
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3436c6669..0489bc50f 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -75,9 +75,7 @@ def train(
         msg.info("Using CPU")
     msg.info(f"Loading config and nlp from: {config_path}")
     with show_validation_error(config_path):
-        config = Config().from_disk(
-            config_path, overrides=config_overrides, interpolate=False
-        )
+        config = util.load_config(config_path, overrides=config_overrides)
     if config.get("training", {}).get("seed") is not None:
         fix_random_seed(config["training"]["seed"])
     # Use original config here before it's resolved to functions
diff --git a/spacy/language.py b/spacy/language.py
index 266c955ef..b67c55e3b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -21,7 +21,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .gold import Example, validate_examples
 from .scorer import Scorer
 from .util import create_default_optimizer, registry
-from .util import SimpleFrozenDict, combine_score_weights
+from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
@@ -36,7 +36,7 @@ from . import about

 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
-DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH, interpolate=False)
+DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)


 class BaseDefaults:
@@ -45,7 +45,7 @@ class BaseDefaults:
     Language.Defaults.
     """

-    config: Config = Config()
+    config: Config = Config(section_order=CONFIG_SECTION_ORDER)
     tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
     prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
     suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
@@ -583,7 +583,7 @@ class Language:
         # We're calling the internal _fill here to avoid constructing the
         # registered functions twice
         resolved, filled = registry.resolve(cfg, validate=validate)
-        filled = filled[factory_name]
+        filled = Config(filled[factory_name])
         filled["factory"] = factory_name
         filled.pop("@factories", None)
         # Merge the final filled config with the raw config (including non-
@@ -1390,7 +1390,9 @@ class Language:
         DOCS: https://spacy.io/api/language#from_config
         """
         if auto_fill:
-            config = Config(cls.default_config).merge(config)
+            config = Config(
+                cls.default_config, section_order=CONFIG_SECTION_ORDER
+            ).merge(config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
         config_lang = config["nlp"]["lang"]
diff --git a/spacy/util.py b/spacy/util.py
index 3459472b6..fd0fe1b7a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -58,6 +58,12 @@ _PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

+# Default order of sections in the config.cfg. Not all sections need to exist,
+# and additional sections are added at the end, in alphabetical order.
+# fmt: off
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+# fmt: on
+

 class registry(thinc.registry):
     languages = catalogue.create("spacy", "languages", entry_points=True)
@@ -264,33 +270,11 @@ def load_model_from_path(
     if not meta:
         meta = get_model_meta(model_path)
     config_path = model_path / "config.cfg"
-    nlp, _ = load_model_from_config_path(
-        config_path, overrides=dict_to_dot(config), vocab=vocab, disable=disable
-    )
+    config = load_config(config_path, overrides=dict_to_dot(config))
+    nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
     return nlp.from_disk(model_path, exclude=disable)


-def load_model_from_config_path(
-    config_path: Union[str, Path],
-    *,
-    vocab: Union["Vocab", bool] = True,
-    disable: Iterable[str] = tuple(),
-    auto_fill: bool = False,
-    validate: bool = True,
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    interpolate: bool = False,
-) -> Tuple["Language", Config]:
-    config_path = ensure_path(config_path)
-    if not config_path.exists() or not config_path.is_file():
-        raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
-    config = Config().from_disk(
-        config_path, overrides=overrides, interpolate=interpolate
-    )
-    return load_model_from_config(
-        config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
-    )
-
-
 def load_model_from_config(
     config: Union[Dict[str, Any], Config],
     *,
@@ -337,6 +321,29 @@ def load_model_from_init_py(
     )

+
+def load_config(
+    path: Union[str, Path],
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    interpolate: bool = False,
+) -> Config:
+    """Load a config file. Takes care of path validation and section order."""
+    config_path = ensure_path(path)
+    if not config_path.exists() or not config_path.is_file():
+        raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
+    return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
+        config_path, overrides=overrides, interpolate=interpolate
+    )
+
+
+def load_config_from_str(
+    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
+):
+    """Load a full config from a string."""
+    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
+        text, overrides=overrides, interpolate=interpolate,
+    )
+

 def get_installed_models() -> List[str]:
     """List all model packages currently installed in the environment.
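A minimal usage sketch of the helpers added above (not part of the patch; based only on the signatures visible in spacy/util.py, with a hypothetical config path and override value):

from spacy import util

# Load a config.cfg from disk. load_config() validates the path and keeps the
# sections in CONFIG_SECTION_ORDER; overrides use dot-notation keys, as
# produced by parse_config_overrides() in the CLI commands.
config = util.load_config("config.cfg", overrides={"training.seed": 0})

# Build a Config from a string template instead, as init_config.py now does.
config_from_str = util.load_config_from_str('[nlp]\nlang = "en"\n')

# Either Config can then be handed to load_model_from_config, which returns
# the nlp object and the resolved config.
nlp, resolved = util.load_model_from_config(config, auto_fill=True)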