Update Thinc and include section order

2025-11-08 03:47:39 +03:00 · 2020-08-14 14:06:22 +02:00 · 2020-08-14 14:06:22 +02:00 · 67cc39af7f
commit 67cc39af7f
parent 8736bfc052
10 changed files with 53 additions and 52 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a25,<8.0.0a30",
+    "thinc>=8.0.0a26,<8.0.0a30",
    "blis>=0.4.0,<0.5.0",
    "pytokenizations",
    "smart_open>=2.0.0,<3.0.0"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a25,<8.0.0a30
+thinc>=8.0.0a26,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
--- a/setup.cfg
+++ b/setup.cfg
@ -34,13 +34,13 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a25,<8.0.0a30
+    thinc>=8.0.0a26,<8.0.0a30
 install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a25,<8.0.0a30
+    thinc>=8.0.0a26,<8.0.0a30
    blis>=0.4.0,<0.5.0
    wasabi>=0.7.1,<1.1.0
    srsly>=2.1.0,<3.0.0
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -5,7 +5,6 @@ import sys
 import srsly
 from wasabi import Printer, MESSAGES, msg, diff_strings
 import typer
-from thinc.api import Config
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, get_sourced_components
@ -49,7 +48,7 @@ def debug_config_cli(
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    with show_validation_error(config_path):
-        config = Config().from_disk(config_path, overrides=overrides, interpolate=False)
+        config = util.load_config(config_path, overrides=overrides)
        try:
            nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill)
        except ValueError as e:
@ -134,9 +133,7 @@ def debug_data(
    if not config_path.exists():
        msg.fail("Config file not found", config_path, exists=1)
    with show_validation_error(config_path):
-        cfg = Config().from_disk(
+        cfg = util.load_config(config_path, overrides=config_overrides)
-            config_path, overrides=config_overrides, interpolate=False
-        )
        nlp, config = util.load_model_from_config(cfg)
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -49,10 +49,9 @@ def debug_model_cli(
    }
    config_overrides = parse_config_overrides(ctx.args)
    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=config_overrides)
        try:
-            nlp, config = util.load_model_from_config_path(
+            nlp, config = util.load_model_from_config(config_path)
-                config_path, overrides=config_overrides
-            )
        except ValueError as e:
            msg.fail(str(e), exits=1)
    seed = config["pretraining"]["seed"]
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@ -1,12 +1,11 @@
 from typing import Optional, List
 from enum import Enum
 from pathlib import Path
-from thinc.api import Config
 from wasabi import Printer
 import srsly
 import re
-from ..util import load_model_from_config, get_lang_class
+from .. import util
 from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
@ -50,7 +49,7 @@ def init_config(
        from jinja2 import Template
    except ImportError:
        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
-    lang_defaults = get_lang_class(lang).Defaults
+    lang_defaults = util.get_lang_class(lang).Defaults
    has_letters = lang_defaults.writing_system.get("has_letters", True)
    has_transformer = False  # TODO: check this somehow
    if has_transformer:
@ -80,9 +79,9 @@ def init_config(
        msg.text(f"- {label}: {value}")
    with show_validation_error(hint_init=False):
        with msg.loading("Auto-filling config..."):
-            config = Config().from_str(base_template, interpolate=False)
+            config = util.load_config_from_str(base_template)
            try:
-                nlp, _ = load_model_from_config(config, auto_fill=True)
+                nlp, _ = util.load_model_from_config(config, auto_fill=True)
            except ValueError as e:
                msg.fail(str(e), exits=1)
    msg.good("Auto-filled config with all values")
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -88,9 +88,8 @@ def pretrain(
        msg.info("Using CPU")
    msg.info(f"Loading config from: {config_path}")
    with show_validation_error(config_path):
-        nlp, config = util.load_model_from_config_path(
+        config = util.load_config(config_path, overrides=config_overrides)
-            config_path, overrides=config_overrides
+        nlp, config = util.load_model_from_config(config)
-        )
    # TODO: validate that [pretraining] block exists
    if not output_dir.exists():
        output_dir.mkdir()
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -75,9 +75,7 @@ def train(
        msg.info("Using CPU")
    msg.info(f"Loading config and nlp from: {config_path}")
    with show_validation_error(config_path):
-        config = Config().from_disk(
+        config = util.load_config(config_path, overrides=config_overrides)
-            config_path, overrides=config_overrides, interpolate=False
-        )
    if config.get("training", {}).get("seed") is not None:
        fix_random_seed(config["training"]["seed"])
    # Use original config here before it's resolved to functions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -21,7 +21,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .gold import Example, validate_examples
 from .scorer import Scorer
 from .util import create_default_optimizer, registry
-from .util import SimpleFrozenDict, combine_score_weights
+from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
@ -36,7 +36,7 @@ from . import about
 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
-DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH, interpolate=False)
+DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
 class BaseDefaults:
@ -45,7 +45,7 @@ class BaseDefaults:
    Language.Defaults.
    """
-    config: Config = Config()
+    config: Config = Config(section_order=CONFIG_SECTION_ORDER)
    tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
    prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES
    suffixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
@ -583,7 +583,7 @@ class Language:
        # We're calling the internal _fill here to avoid constructing the
        # registered functions twice
        resolved, filled = registry.resolve(cfg, validate=validate)
-        filled = filled[factory_name]
+        filled = Config(filled[factory_name])
        filled["factory"] = factory_name
        filled.pop("@factories", None)
        # Merge the final filled config with the raw config (including non-
@ -1390,7 +1390,9 @@ class Language:
        DOCS: https://spacy.io/api/language#from_config
        """
        if auto_fill:
-            config = Config(cls.default_config).merge(config)
+            config = Config(
+                cls.default_config, section_order=CONFIG_SECTION_ORDER
+            ).merge(config)
        if "nlp" not in config:
            raise ValueError(Errors.E985.format(config=config))
        config_lang = config["nlp"]["lang"]
--- a/spacy/util.py
+++ b/spacy/util.py
@ -58,6 +58,12 @@ _PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+# Default order of sections in the config.cfg. Not all sections needs to exist,
+# and additional sections are added at the end, in alphabetical order.
+# fmt: off
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+# fmt: on
 class registry(thinc.registry):
    languages = catalogue.create("spacy", "languages", entry_points=True)
@ -264,33 +270,11 @@ def load_model_from_path(
    if not meta:
        meta = get_model_meta(model_path)
    config_path = model_path / "config.cfg"
-    nlp, _ = load_model_from_config_path(
+    config = load_config(config_path, overrides=dict_to_dot(config))
-        config_path, overrides=dict_to_dot(config), vocab=vocab, disable=disable
+    nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
-    )
    return nlp.from_disk(model_path, exclude=disable)
-def load_model_from_config_path(
-    config_path: Union[str, Path],
-    *,
-    vocab: Union["Vocab", bool] = True,
-    disable: Iterable[str] = tuple(),
-    auto_fill: bool = False,
-    validate: bool = True,
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    interpolate: bool = False,
-) -> Tuple["Language", Config]:
-    config_path = ensure_path(config_path)
-    if not config_path.exists() or not config_path.is_file():
-        raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
-    config = Config().from_disk(
-        config_path, overrides=overrides, interpolate=interpolate
-    )
-    return load_model_from_config(
-        config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
-    )
 def load_model_from_config(
    config: Union[Dict[str, Any], Config],
    *,
@ -337,6 +321,29 @@ def load_model_from_init_py(
    )
+def load_config(
+    path: Union[str, Path],
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    interpolate: bool = False,
+) -> Config:
+    """Load a config file. Takes care of path validation and section order."""
+    config_path = ensure_path(path)
+    if not config_path.exists() or not config_path.is_file():
+        raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
+    return Config(section_order=CONFIG_SECTION_ORDER).from_disk(
+        config_path, overrides=overrides, interpolate=interpolate
+    )
+def load_config_from_str(
+    text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
+):
+    """Load a full config from a string."""
+    return Config(section_order=CONFIG_SECTION_ORDER).from_str(
+        text, overrides=overrides, interpolate=interpolate,
+    )
 def get_installed_models() -> List[str]:
    """List all model packages currently installed in the environment.