Merge branch 'develop' of https://github.com/explosion/spaCy into tmp/fix-tagger-begin-train

2025-07-10 16:22:29 +03:00 · 2020-08-28 15:58:18 +02:00 · 2020-08-28 15:58:18 +02:00 · ef9888c1f7
commit ef9888c1f7
parent 472eb28716 89f692bc8a
11 changed files with 263 additions and 50 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -7,7 +7,7 @@ ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0
-catalogue>=0.0.7,<1.1.0
+catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy
 # Third party dependencies
--- a/setup.cfg
+++ b/setup.cfg
@ -44,7 +44,7 @@ install_requires =
    blis>=0.4.0,<0.5.0
    wasabi>=0.8.0,<1.1.0
    srsly>=2.1.0,<3.0.0
-    catalogue>=0.0.7,<1.1.0
+    catalogue>=2.0.1,<2.1.0
    typer>=0.3.0,<0.4.0
    pathy
    # Third-party dependencies
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -11,6 +11,7 @@ from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@ -0,0 +1,93 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer
 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
 from .. import util
# CLI entry point for "debug config". The context settings allow extra and
# unknown options so that they end up in ctx.args, where they are read and
# parsed as config overrides instead of being rejected up front.
@debug_cli.command(
    "config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
 def debug_config_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
    show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
    # fmt: on
 ):
    # NOTE(review): presumably this docstring doubles as the command's help
    # text, so treat it as user-facing output when editing - confirm.
    """Debug a config.cfg file and show validation errors. The command will
    create all objects in the tree and validate them. Note that some config
    validation errors are blocking and will prevent the rest of the config from
    being resolved. This means that you may not see all validation errors at
    once and some issues are only shown once previous errors have been fixed.
    Similar as with the 'train' command, you can override settings from the config
    as command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]".
    """
    # The additional command-line arguments are interpreted as config overrides
    overrides = parse_config_overrides(ctx.args)
    # Import the optional user code before the config is loaded and validated
    import_code(code_path)
    # The actual work lives in debug_config so it can be called without the CLI
    debug_config(
        config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
    )
 def debug_config(
    config_path: Path,
    *,
    overrides: Optional[Dict[str, Any]] = None,
    show_funcs: bool = False,
    show_vars: bool = False,
 ) -> None:
    """Validate a config file and optionally print an overview of the
    variables and registered functions it references.
    config_path (Path): Path to the config file to load and validate.
    overrides (Optional[Dict[str, Any]]): Config overrides passed on to
        util.load_config. None (the default) means no overrides.
    show_funcs (bool): Print one table per registered function found by
        get_registered_funcs (registry, name, module, file and line number).
    show_vars (bool): Print a table of the variables found by get_variables
        and their values.
    """
    # Use a fresh dict per call rather than a mutable default argument that
    # would be shared between calls.
    if overrides is None:
        overrides = {}
    msg.divider("Config validation")
    # NOTE(review): show_validation_error presumably reports validation errors
    # raised inside this block - confirm against its definition in ._util.
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        # Called only so that problems in the config surface here; the
        # returned objects are not needed.
        util.load_model_from_config(config)
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)
        msg.divider(f"Variables ({len(variables)})")
        head = ("Variable", "Value")
        msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2)
    if show_funcs:
        funcs = get_registered_funcs(config)
        msg.divider(f"Registered functions ({len(funcs)})")
        for func in funcs:
            func_data = {
                "Registry": f"@{func['registry']}",
                "Name": func["name"],
                "Module": func["module"],
                "File": f"{func['file']} (line {func['line_no']})",
            }
            # The heading is the config section the function block lives in
            msg.info(f"[{func['path']}]")
            print(table(func_data).strip())
 def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]:
    """Collect details about every registered function referenced in a config.
    Walks the config with util.walk_dict and picks out the entries whose last
    key starts with "@". For each one, the registry of that name is looked up
    on util.registry and asked (via its find method) about the function.
    config (Config): The config to inspect.
    RETURNS (List[Dict]): One dict per reference with the keys "name",
        "registry" and "path", plus whatever the registry's find method
        returns for that function.
    """
    found = []
    for key_path, func_name in util.walk_dict(config):
        last_key = key_path[-1]
        if last_key.startswith("@"):
            # The registry name is the key without its leading "@"
            registry_name = last_key[1:]
            registry = getattr(util.registry, registry_name)
            # Dotted path of the section that contains the reference
            section = ".".join(key_path[:-1])
            details = registry.find(func_name)
            found.append(
                {
                    "name": func_name,
                    "registry": registry_name,
                    "path": section,
                    **details,
                }
            )
    return found
 def get_variables(config: Config) -> Dict[str, Any]:
    """Map every variable referenced in a config to the repr of its value.
    The config is serialized with to_str and searched with VARIABLE_RE. Each
    distinct match is turned into a dotted path (first two characters and the
    last character dropped, ":" replaced by ".") and resolved against the
    config with util.dot_to_object.
    config (Config): The config to inspect.
    RETURNS (Dict[str, Any]): The matched variable strings, in sorted order,
        mapped to repr() of the resolved value.
    """
    config_text = config.to_str()
    # De-duplicate and sort so each variable is listed once, in stable order
    references = sorted(set(VARIABLE_RE.findall(config_text)))
    return {
        ref: repr(util.dot_to_object(config, ref[2:-1].replace(":", ".")))
        for ref in references
    }
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -23,34 +23,6 @@ BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000
# CLI entry point for "debug config". The context settings allow extra and
# unknown options so that they end up in ctx.args, where they are read and
# parsed as config overrides instead of being rejected up front.
@debug_cli.command(
    "config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
 def debug_config_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    # fmt: on
 ):
    """Debug a config.cfg file and show validation errors. The command will
    create all objects in the tree and validate them. Note that some config
    validation errors are blocking and will prevent the rest of the config from
    being resolved. This means that you may not see all validation errors at
    once and some issues are only shown once previous errors have been fixed.
    Similar as with the 'train' command, you can override settings from the config
    as command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]".
    """
    # The additional command-line arguments are interpreted as config overrides
    overrides = parse_config_overrides(ctx.args)
    # Import the optional user code before the config is loaded and validated
    import_code(code_path)
    # Load the config and build the pipeline from it inside the validation
    # error context; the success message is only reached afterwards.
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp, _ = util.load_model_from_config(config)
    msg.good("Original config is valid")
@debug_cli.command(
    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
--- a/spacy/gold/loggers.py
+++ b/spacy/gold/loggers.py
@ -1,6 +1,7 @@
-from typing import Dict, Any, Tuple, Callable
+from typing import Dict, Any, Tuple, Callable, List
 from ..util import registry
 from .. import util
 from ..errors import Errors
 from wasabi import msg
@ -66,7 +67,7 @@ def console_logger():
@registry.loggers("spacy.WandbLogger.v1")
-def wandb_logger(project_name: str):
+def wandb_logger(project_name: str, remove_config_values: List[str] = []):
    import wandb
    console = console_logger()
@ -75,16 +76,19 @@ def wandb_logger(project_name: str):
        nlp: "Language"
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
        config = nlp.config.interpolate()
        config_dot = util.dict_to_dot(config)
        for field in remove_config_values:
            del config_dot[field]
        config = util.dot_to_dict(config_dot)
        wandb.init(project=project_name, config=config)
        console_log_step, console_finalize = console(nlp)
        def log_step(info: Dict[str, Any]):
            console_log_step(info)
            epoch = info["epoch"]
            score = info["score"]
            other_scores = info["other_scores"]
            losses = info["losses"]
-            wandb.log({"score": score, "epoch": epoch})
+            wandb.log({"score": score})
            if losses:
                wandb.log({f"loss_{k}": v for k, v in losses.items()})
            if isinstance(other_scores, dict):
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1542,7 +1542,9 @@ class Language:
        path = util.ensure_path(path)
        deserializers = {}
        if Path(path / "config.cfg").exists():
-            deserializers["config.cfg"] = lambda p: self.config.from_disk(p)
+            deserializers["config.cfg"] = lambda p: self.config.from_disk(
                p, interpolate=False
            )
        deserializers["meta.json"] = deserialize_meta
        deserializers["vocab"] = deserialize_vocab
        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
@ -1605,7 +1607,9 @@ class Language:
            self.vocab.vectors.name = data.get("vectors", {}).get("name")
        deserializers = {}
-        deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
+        deserializers["config.cfg"] = lambda b: self.config.from_bytes(
            b, interpolate=False
        )
        deserializers["meta.json"] = deserialize_meta
        deserializers["vocab"] = self.vocab.from_bytes
        deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@ -81,9 +81,9 @@ def test_replace_last_pipe(nlp):
 def test_replace_pipe_config(nlp):
    nlp.add_pipe("entity_linker")
    nlp.add_pipe("sentencizer")
-    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == True
+    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is True
    nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False})
-    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == False
+    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is False
@pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@ -209,6 +209,20 @@ def test_config_nlp_roundtrip():
    assert new_nlp._factory_meta == nlp._factory_meta
 def test_config_nlp_roundtrip_bytes_disk():
    """Check that an English pipeline's config is unchanged after a round
    trip through bytes and through disk, i.e. that it is serialized correctly
    and not interpolated by mistake."""
    # Round trip via bytes
    source_nlp = English()
    payload = source_nlp.to_bytes()
    restored_from_bytes = English().from_bytes(payload)
    assert restored_from_bytes.config == source_nlp.config
    # Round trip via a temporary directory on disk
    disk_nlp = English()
    with make_tempdir() as tmp_dir:
        disk_nlp.to_disk(tmp_dir)
        restored_from_disk = spacy.load(tmp_dir)
    assert restored_from_disk.config == disk_nlp.config
 def test_serialize_config_language_specific():
    """Test that config serialization works as expected with language-specific
    factories."""
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -1,4 +1,6 @@
 import pytest
 from click import NoSuchOption
 from spacy.gold import docs_to_json, biluo_tags_from_offsets
 from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
@ -372,9 +374,18 @@ def test_parse_config_overrides(args, expected):
@pytest.mark.parametrize(
    "args",
-    [["--foo"], ["--x.foo", "bar", "--baz"], ["--x.foo", "bar", "baz"], ["x.foo"]],
+    [["--foo"], ["--x.foo", "bar", "--baz"]],
 )
 def test_parse_config_overrides_invalid(args):
    with pytest.raises(NoSuchOption):
        parse_config_overrides(args)
# Argument lists that parse_config_overrides is expected to reject by exiting:
# one has an extra value ("baz") after an option that already has a value,
# the other has an argument ("x.foo") without the leading "--".
@pytest.mark.parametrize(
    "args",
    [["--x.foo", "bar", "baz"], ["x.foo"]],
 )
 def test_parse_config_overrides_invalid_2(args):
    """Check that parse_config_overrides raises SystemExit for these args."""
    with pytest.raises(SystemExit):
        parse_config_overrides(args)
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -246,19 +246,19 @@ some config validation errors are blocking and will prevent the rest of the
 config from being resolved. This means that you may not see all validation
 errors at once and some issues are only shown once previous errors have been
 fixed. To auto-fill a partial config and save the result, you can use the
-[`init fillconfig`](/api/cli#init-fill-config) command.
+[`init fill-config`](/api/cli#init-fill-config) command.
 ```cli
-$ python -m spacy debug config [config_path] [--code_path] [overrides]
+$ python -m spacy debug config [config_path] [--code-path] [--show-functions] [--show-variables] [overrides]
 ```
 > #### Example
 >
 > ```cli
-> $ python -m spacy debug config ./config.cfg
+> $ python -m spacy debug config config.cfg
 > ```
-<Accordion title="Example output" spaced>
+<Accordion title="Example output (validation error)">
 ```
 ✘ Config validation error
@ -277,10 +277,124 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start
 </Accordion>
 <Accordion title="Example output (valid config and all options)" spaced>
 ```cli
 $ python -m spacy debug config ./config.cfg --show-functions --show-variables
 ```
 ```
 ============================= Config validation =============================
 ✔ Config is valid
 =============================== Variables (6) ===============================
 Variable                                   Value
 -----------------------------------------  ----------------------------------
 ${components.tok2vec.model.encode.width}   96
 ${paths.dev}                               'hello'
 ${paths.init_tok2vec}                      None
 ${paths.raw}                               None
 ${paths.train}                             ''
 ${system.seed}                             0
 ========================= Registered functions (17) =========================
 ℹ [nlp.tokenizer]
 Registry   @tokenizers
 Name       spacy.Tokenizer.v1
 Module     spacy.language
 File       /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry   @architectures
 Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]
 Registry   @architectures
 Name       spacy.Tok2VecListener.v1
 Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry   @architectures
 Name       spacy.TransitionBasedParser.v1
 Module     spacy.ml.models.parser
 File       /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]
 Registry   @architectures
 Name       spacy.Tok2VecListener.v1
 Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.tagger.model]
 Registry   @architectures
 Name       spacy.Tagger.v1
 Module     spacy.ml.models.tagger
 File       /path/to/spacy/ml/models/tagger.py (line 9)
 ℹ [components.tagger.model.tok2vec]
 Registry   @architectures
 Name       spacy.Tok2VecListener.v1
 Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.tok2vec.model]
 Registry   @architectures
 Name       spacy.Tok2Vec.v1
 Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 72)
 ℹ [components.tok2vec.model.embed]
 Registry   @architectures
 Name       spacy.MultiHashEmbed.v1
 Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 93)
 ℹ [components.tok2vec.model.encode]
 Registry   @architectures
 Name       spacy.MaxoutWindowEncoder.v1
 Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 207)
 ℹ [training.logger]
 Registry   @loggers
 Name       spacy.ConsoleLogger.v1
 Module     spacy.gold.loggers
 File       /path/to/spacy/gold/loggers.py (line 8)
 ℹ [training.batcher]
 Registry   @batchers
 Name       batch_by_words.v1
 Module     spacy.gold.batchers
 File       /path/to/spacy/gold/batchers.py (line 49)
 ℹ [training.batcher.size]
 Registry   @schedules
 Name       compounding.v1
 Module     thinc.schedules
 File       /path/to/thinc/thinc/schedules.py (line 43)
 ℹ [training.dev_corpus]
 Registry   @readers
 Name       spacy.Corpus.v1
 Module     spacy.gold.corpus
 File       /path/to/spacy/gold/corpus.py (line 18)
 ℹ [training.optimizer]
 Registry   @optimizers
 Name       Adam.v1
 Module     thinc.optimizers
 File       /path/to/thinc/thinc/optimizers.py (line 58)
 ℹ [training.optimizer.learn_rate]
 Registry   @schedules
 Name       warmup_linear.v1
 Module     thinc.schedules
 File       /path/to/thinc/thinc/schedules.py (line 91)
 ℹ [training.train_corpus]
 Registry   @readers
 Name       spacy.Corpus.v1
 Module     spacy.gold.corpus
 File       /path/to/spacy/gold/corpus.py (line 18)
 ```
 </Accordion>
 | Name                     | Description                                                                                                                                                                                                                    |
-| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`            | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                                                    |
-| `--code_path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       |
+| `--code-path`, `-c`      | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
 | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~                                             |
 | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
 | `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                     |
 | overrides                | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                                     |
 | **PRINTS**               | Config validation errors, if available.                                                                                                                                                                                        |