Merge branch 'develop' of https://github.com/explosion/spaCy into tmp/fix-tagger-begin-train

2025-12-06 09:44:21 +03:00 · 2020-08-28 15:58:18 +02:00 · 2020-08-28 15:58:18 +02:00 · ef9888c1f7
commit ef9888c1f7
parent 472eb28716 89f692bc8a
11 changed files with 263 additions and 50 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -7,7 +7,7 @@ ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0
-catalogue>=0.0.7,<1.1.0
+catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy
 # Third party dependencies
--- a/setup.cfg
+++ b/setup.cfg
@ -44,7 +44,7 @@ install_requires =
    blis>=0.4.0,<0.5.0
    wasabi>=0.8.0,<1.1.0
    srsly>=2.1.0,<3.0.0
-    catalogue>=0.0.7,<1.1.0
+    catalogue>=2.0.1,<2.1.0
    typer>=0.3.0,<0.4.0
    pathy
    # Third-party dependencies
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -11,6 +11,7 @@ from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
+from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@ -0,0 +1,93 @@
+from typing import Optional, Dict, Any, Union, List
+from pathlib import Path
+from wasabi import msg, table
+from thinc.api import Config
+from thinc.config import VARIABLE_RE
+import typer
+
+from ._util import Arg, Opt, show_validation_error, parse_config_overrides
+from ._util import import_code, debug_cli
+from .. import util
+
+
+@debug_cli.command(
+    "config",
+    # Let extra / unknown "--section.key value" arguments through to ctx.args
+    # instead of rejecting them; they are parsed as config overrides below.
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def debug_config_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
+    show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
+    # fmt: on
+):
+    """Debug a config.cfg file and show validation errors. The command will
+    create all objects in the tree and validate them. Note that some config
+    validation errors are blocking and will prevent the rest of the config from
+    being resolved. This means that you may not see all validation errors at
+    once and some issues are only shown once previous errors have been fixed.
+    Similar as with the 'train' command, you can override settings from the config
+    as command line options. For instance, --training.batch_size 128 overrides
+    the value of "batch_size" in the block "[training]".
+    """
+    # The remaining, unconsumed CLI arguments are interpreted as config overrides.
+    overrides = parse_config_overrides(ctx.args)
+    # Import the optional user code before validating - presumably so that any
+    # functions it registers can be resolved from the config (see --code-path).
+    import_code(code_path)
+    # The actual work lives in debug_config so it can be called without the CLI.
+    debug_config(
+        config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
+    )
+
+
+def debug_config(
+    config_path: Path,
+    *,
+    # NOTE(review): mutable default argument. It is only passed on to
+    # util.load_config here, but a None default would be the safer idiom.
+    overrides: Dict[str, Any] = {},
+    show_funcs: bool = False,
+    show_vars: bool = False,
+):
+    """Load a config file, validate it and print a report to the console.
+
+    config_path: Path of the config file to load.
+    overrides: Config overrides, passed through to util.load_config.
+    show_funcs: If True, print one info table per registered function
+        referenced in the config (registry, name, module, file and line).
+    show_vars: If True, print a table of the variables referenced in the
+        config together with the repr of their values.
+    """
+    msg.divider("Config validation")
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+        # The returned objects are not used below; the call is made for its
+        # validating effect (the CLI help says all objects in the tree are
+        # created and validated).
+        nlp, _ = util.load_model_from_config(config)
+    # NOTE(review): presumably only reached when show_validation_error did not
+    # report a problem and abort - confirm against its implementation.
+    msg.good("Config is valid")
+    if show_vars:
+        variables = get_variables(config)
+        msg.divider(f"Variables ({len(variables)})")
+        head = ("Variable", "Value")
+        msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2)
+    if show_funcs:
+        funcs = get_registered_funcs(config)
+        msg.divider(f"Registered functions ({len(funcs)})")
+        for func in funcs:
+            # "module", "file" and "line_no" are not set by get_registered_funcs
+            # itself; they come from the registry.find() result merged in there.
+            func_data = {
+                "Registry": f"@{func['registry']}",
+                "Name": func["name"],
+                "Module": func["module"],
+                "File": f"{func['file']} (line {func['line_no']})",
+            }
+            # Heading is the config section that references the function.
+            msg.info(f"[{func['path']}]")
+            print(table(func_data).strip())
+
+
+def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]:
+    """Collect details about every registered function referenced in a config.
+
+    The config is traversed with util.walk_dict. Every entry whose last key
+    element starts with "@" is treated as a reference to a registered
+    function: the text after the "@" is the registry name and the entry's
+    value is the function name within that registry.
+
+    config: The config to inspect.
+    RETURNS: One dict per reference with the keys "name", "registry" and
+        "path" (dot-joined config section), plus whatever registry.find()
+        returns for that function.
+    """
+    result = []
+    for key, value in util.walk_dict(config):
+        if not key[-1].startswith("@"):
+            continue
+        # We have a reference to a registered function
+        reg_name = key[-1][1:]
+        # Registries are looked up by name as attributes of util.registry.
+        registry = getattr(util.registry, reg_name)
+        # The section path is the key without the trailing "@registry" element.
+        path = ".".join(key[:-1])
+        info = registry.find(value)
+        result.append({"name": value, "registry": reg_name, "path": path, **info})
+    return result
+
+
+def get_variables(config: Config) -> Dict[str, Any]:
+    """Find the variables referenced in a config and look up their values.
+
+    The config is serialized with to_str() and searched with thinc's
+    VARIABLE_RE. Duplicate matches are dropped and the rest processed in
+    sorted order.
+
+    config: The config to inspect.
+    RETURNS: Dict mapping each variable exactly as matched to the repr() of
+        the value found at the corresponding dotted path in the config.
+    """
+    result = {}
+    for variable in sorted(set(VARIABLE_RE.findall(config.to_str()))):
+        # Drop the first two and the last character - presumably the "${" and
+        # "}" delimiters - and turn any ":" separators into dots so the name
+        # can be resolved as a dotted path.
+        path = variable[2:-1].replace(":", ".")
+        value = util.dot_to_object(config, path)
+        result[variable] = repr(value)
+    return result
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -23,34 +23,6 @@ BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000


-@debug_cli.command(
-    "config",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def debug_config_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    config_path: Path = Arg(..., help="Path to config file", exists=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    # fmt: on
-):
-    """Debug a config.cfg file and show validation errors. The command will
-    create all objects in the tree and validate them. Note that some config
-    validation errors are blocking and will prevent the rest of the config from
-    being resolved. This means that you may not see all validation errors at
-    once and some issues are only shown once previous errors have been fixed.
-    Similar as with the 'train' command, you can override settings from the config
-    as command line options. For instance, --training.batch_size 128 overrides
-    the value of "batch_size" in the block "[training]".
-    """
-    overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
-    with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=overrides)
-        nlp, _ = util.load_model_from_config(config)
-    msg.good("Original config is valid")
-
-
@debug_cli.command(
    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
--- a/spacy/gold/loggers.py
+++ b/spacy/gold/loggers.py
@ -1,6 +1,7 @@
-from typing import Dict, Any, Tuple, Callable
+from typing import Dict, Any, Tuple, Callable, List

 from ..util import registry
+from .. import util
 from ..errors import Errors
 from wasabi import msg

@ -66,7 +67,7 @@ def console_logger():


@registry.loggers("spacy.WandbLogger.v1")
-def wandb_logger(project_name: str):
+def wandb_logger(project_name: str, remove_config_values: List[str] = []):
    import wandb

    console = console_logger()
@ -75,16 +76,19 @@ def wandb_logger(project_name: str):
        nlp: "Language"
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
        config = nlp.config.interpolate()
+        config_dot = util.dict_to_dot(config)
+        for field in remove_config_values:
+            del config_dot[field]
+        config = util.dot_to_dict(config_dot)
        wandb.init(project=project_name, config=config)
        console_log_step, console_finalize = console(nlp)

        def log_step(info: Dict[str, Any]):
            console_log_step(info)
-            epoch = info["epoch"]
            score = info["score"]
            other_scores = info["other_scores"]
            losses = info["losses"]
-            wandb.log({"score": score, "epoch": epoch})
+            wandb.log({"score": score})
            if losses:
                wandb.log({f"loss_{k}": v for k, v in losses.items()})
            if isinstance(other_scores, dict):
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1542,7 +1542,9 @@ class Language:
        path = util.ensure_path(path)
        deserializers = {}
        if Path(path / "config.cfg").exists():
-            deserializers["config.cfg"] = lambda p: self.config.from_disk(p)
+            deserializers["config.cfg"] = lambda p: self.config.from_disk(
+                p, interpolate=False
+            )
        deserializers["meta.json"] = deserialize_meta
        deserializers["vocab"] = deserialize_vocab
        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
@ -1605,7 +1607,9 @@ class Language:
            self.vocab.vectors.name = data.get("vectors", {}).get("name")

        deserializers = {}
-        deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
+        deserializers["config.cfg"] = lambda b: self.config.from_bytes(
+            b, interpolate=False
+        )
        deserializers["meta.json"] = deserialize_meta
        deserializers["vocab"] = self.vocab.from_bytes
        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@ -81,9 +81,9 @@ def test_replace_last_pipe(nlp):
 def test_replace_pipe_config(nlp):
    nlp.add_pipe("entity_linker")
    nlp.add_pipe("sentencizer")
-    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == True
+    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is True
    nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False})
-    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == False
+    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is False


@pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@ -209,6 +209,20 @@ def test_config_nlp_roundtrip():
    assert new_nlp._factory_meta == nlp._factory_meta


+def test_config_nlp_roundtrip_bytes_disk():
+    """Test that the config is serialized correctly and not interpolated
+    by mistake."""
+    # Round-trip through bytes: the restored config must equal the original.
+    nlp = English()
+    nlp_bytes = nlp.to_bytes()
+    new_nlp = English().from_bytes(nlp_bytes)
+    assert new_nlp.config == nlp.config
+    # Round-trip through disk with a fresh pipeline, loaded via spacy.load.
+    nlp = English()
+    with make_tempdir() as d:
+        nlp.to_disk(d)
+        new_nlp = spacy.load(d)
+    assert new_nlp.config == nlp.config
+
+
 def test_serialize_config_language_specific():
    """Test that config serialization works as expected with language-specific
    factories."""
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -1,4 +1,6 @@
 import pytest
+from click import NoSuchOption
+
 from spacy.gold import docs_to_json, biluo_tags_from_offsets
 from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
@ -372,9 +374,18 @@ def test_parse_config_overrides(args, expected):

@pytest.mark.parametrize(
    "args",
-    [["--foo"], ["--x.foo", "bar", "--baz"], ["--x.foo", "bar", "baz"], ["x.foo"]],
+    [["--foo"], ["--x.foo", "bar", "--baz"]],
 )
 def test_parse_config_overrides_invalid(args):
+    with pytest.raises(NoSuchOption):
+        parse_config_overrides(args)
+
+
+@pytest.mark.parametrize(
+    "args",
+    [["--x.foo", "bar", "baz"], ["x.foo"]],
+)
+def test_parse_config_overrides_invalid_2(args):
    with pytest.raises(SystemExit):
        parse_config_overrides(args)

--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -246,19 +246,19 @@ some config validation errors are blocking and will prevent the rest of the
 config from being resolved. This means that you may not see all validation
 errors at once and some issues are only shown once previous errors have been
 fixed. To auto-fill a partial config and save the result, you can use the
-[`init fillconfig`](/api/cli#init-fill-config) command.
+[`init fill-config`](/api/cli#init-fill-config) command.

 ```cli
-$ python -m spacy debug config [config_path] [--code_path] [overrides]
+$ python -m spacy debug config [config_path] [--code-path] [--show-functions] [--show-variables] [overrides]
 ```

 > #### Example
 >
 > ```cli
-> $ python -m spacy debug config ./config.cfg
+> $ python -m spacy debug config config.cfg
 > ```

-<Accordion title="Example output" spaced>
+<Accordion title="Example output (validation error)">

 ```
 ✘ Config validation error
@ -277,13 +277,127 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start

 </Accordion>

-| Name                | Description                                                                                                                                                                                |
-| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `config_path`       | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                |
-| `--code_path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       |
-| `--help`, `-h`      | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                 |
-| overrides           | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
-| **PRINTS**          | Config validation errors, if available.                                                                                                                                                    |
+<Accordion title="Example output (valid config and all options)" spaced>
+
+```cli
+$ python -m spacy debug config ./config.cfg --show-functions --show-variables
+```
+
+```
+============================= Config validation =============================
+✔ Config is valid
+
+=============================== Variables (6) ===============================
+
+Variable                                   Value
+-----------------------------------------  ----------------------------------
+${components.tok2vec.model.encode.width}   96
+${paths.dev}                               'hello'
+${paths.init_tok2vec}                      None
+${paths.raw}                               None
+${paths.train}                             ''
+${system.seed}                             0
+
+
+========================= Registered functions (17) =========================
+ℹ [nlp.tokenizer]
+Registry   @tokenizers
+Name       spacy.Tokenizer.v1
+Module     spacy.language
+File       /path/to/spacy/language.py (line 64)
+ℹ [components.ner.model]
+Registry   @architectures
+Name       spacy.TransitionBasedParser.v1
+Module     spacy.ml.models.parser
+File       /path/to/spacy/ml/models/parser.py (line 11)
+ℹ [components.ner.model.tok2vec]
+Registry   @architectures
+Name       spacy.Tok2VecListener.v1
+Module     spacy.ml.models.tok2vec
+File       /path/to/spacy/ml/models/tok2vec.py (line 16)
+ℹ [components.parser.model]
+Registry   @architectures
+Name       spacy.TransitionBasedParser.v1
+Module     spacy.ml.models.parser
+File       /path/to/spacy/ml/models/parser.py (line 11)
+ℹ [components.parser.model.tok2vec]
+Registry   @architectures
+Name       spacy.Tok2VecListener.v1
+Module     spacy.ml.models.tok2vec
+File       /path/to/spacy/ml/models/tok2vec.py (line 16)
+ℹ [components.tagger.model]
+Registry   @architectures
+Name       spacy.Tagger.v1
+Module     spacy.ml.models.tagger
+File       /path/to/spacy/ml/models/tagger.py (line 9)
+ℹ [components.tagger.model.tok2vec]
+Registry   @architectures
+Name       spacy.Tok2VecListener.v1
+Module     spacy.ml.models.tok2vec
+File       /path/to/spacy/ml/models/tok2vec.py (line 16)
+ℹ [components.tok2vec.model]
+Registry   @architectures
+Name       spacy.Tok2Vec.v1
+Module     spacy.ml.models.tok2vec
+File       /path/to/spacy/ml/models/tok2vec.py (line 72)
+ℹ [components.tok2vec.model.embed]
+Registry   @architectures
+Name       spacy.MultiHashEmbed.v1
+Module     spacy.ml.models.tok2vec
+File       /path/to/spacy/ml/models/tok2vec.py (line 93)
+ℹ [components.tok2vec.model.encode]
+Registry   @architectures
+Name       spacy.MaxoutWindowEncoder.v1
+Module     spacy.ml.models.tok2vec
+File       /path/to/spacy/ml/models/tok2vec.py (line 207)
+ℹ [training.logger]
+Registry   @loggers
+Name       spacy.ConsoleLogger.v1
+Module     spacy.gold.loggers
+File       /path/to/spacy/gold/loggers.py (line 8)
+ℹ [training.batcher]
+Registry   @batchers
+Name       batch_by_words.v1
+Module     spacy.gold.batchers
+File       /path/to/spacy/gold/batchers.py (line 49)
+ℹ [training.batcher.size]
+Registry   @schedules
+Name       compounding.v1
+Module     thinc.schedules
+File       /path/to/thinc/thinc/schedules.py (line 43)
+ℹ [training.dev_corpus]
+Registry   @readers
+Name       spacy.Corpus.v1
+Module     spacy.gold.corpus
+File       /path/to/spacy/gold/corpus.py (line 18)
+ℹ [training.optimizer]
+Registry   @optimizers
+Name       Adam.v1
+Module     thinc.optimizers
+File       /path/to/thinc/thinc/optimizers.py (line 58)
+ℹ [training.optimizer.learn_rate]
+Registry   @schedules
+Name       warmup_linear.v1
+Module     thinc.schedules
+File       /path/to/thinc/thinc/schedules.py (line 91)
+ℹ [training.train_corpus]
+Registry   @readers
+Name       spacy.Corpus.v1
+Module     spacy.gold.corpus
+File       /path/to/spacy/gold/corpus.py (line 18)
+```
+
+</Accordion>
+
+| Name                     | Description                                                                                                                                                                                                                    |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`            | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                                                    |
+| `--code-path`, `-c`      | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
+| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~                                             |
+| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
+| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                     |
+| overrides                | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                                     |
+| **PRINTS**               | Config validation errors, if available.                                                                                                                                                                                        |

 ### debug data {#debug-data tag="command"}