Update config and commands

Ines Montani 2020-07-11 13:03:53 +02:00
parent 9e48ea48a1
commit b7111da1d7
8 changed files with 122 additions and 64 deletions

@@ -48,7 +48,7 @@ use_averages = false
eps = 1e-8
#learn_rate = 0.001
[optimizer.learn_rate]
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
@@ -56,8 +56,11 @@ initial_rate = 0.001
[nlp]
lang = "en"
base_model = null
vectors = null
[nlp.pipeline]
[nlp.pipeline.tok2vec]
factory = "tok2vec"
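
For reference, a minimal sketch (in Python, using thinc's Config) of the nested layout the hunks above move to: the learning-rate schedule now sits under [training.optimizer.learn_rate], and each pipeline component gets its own [nlp.pipeline.<name>] block with a factory key. Only the section names and keys visible in the diff are taken from it; the optimizer block and all values are illustrative placeholders.

from thinc.api import Config

# Pared-down config mirroring the nesting shown above (placeholder values,
# not the full default config).
cfg_str = """
[training]

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.001

[nlp]
lang = "en"
base_model = null
vectors = null

[nlp.pipeline]

[nlp.pipeline.tok2vec]
factory = "tok2vec"
"""

config = Config().from_str(cfg_str)
assert "learn_rate" in config["training"]["optimizer"]
assert config["nlp"]["pipeline"]["tok2vec"]["factory"] == "tok2vec"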

@@ -25,6 +25,11 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "words"
use_gpu = -1
raw_text = null
tag_map = null
[training.batch_size]
@schedules = "compounding.v1"
@@ -72,6 +77,9 @@ learn_rate = 0.001
[nlp]
lang = "en"
vectors = null
base_model = null
[nlp.pipeline]
[nlp.pipeline.tok2vec]
factory = "tok2vec"
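
The [training.batch_size] block above points at the registered "compounding.v1" schedule. As a rough illustration of what that schedule produces, here is a minimal re-implementation; the start/stop/compound values are placeholders, since the actual values are not shown in this hunk.

def compounding(start: float, stop: float, compound: float):
    # Sketch of the "compounding.v1" behaviour: each value is the previous
    # one multiplied by compound, capped at stop, giving a slowly growing
    # batch size.
    curr = start
    while True:
        yield min(curr, stop)
        curr *= compound

batch_sizes = compounding(100.0, 1000.0, 1.001)
print([round(next(batch_sizes), 2) for _ in range(3)])  # [100.0, 100.1, 100.2]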

@@ -1,4 +1,4 @@
from typing import Dict, Any, Union, List
from typing import Dict, Any, Union, List, Optional
from pathlib import Path
from wasabi import msg
import srsly
@@ -11,6 +11,7 @@ from configparser import InterpolationError
import sys
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file
PROJECT_FILE = "project.yml"
@@ -172,3 +173,16 @@ def show_validation_error(title: str = "Config validation error"):
msg.fail(title, spaced=True)
print(str(e).replace("Config validation error", "").strip())
sys.exit(1)
def import_code(code_path: Optional[Union[Path, str]]) -> None:
"""Helper to import Python file provided in training commands / commands
using the config. This makes custom registered functions available.
"""
if code_path is not None:
if not Path(code_path).exists():
msg.fail("Path to Python code not found", code_path, exits=1)
try:
import_file("python_code", code_path)
except Exception as e:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)

@@ -1,15 +1,18 @@
from typing import Optional, List, Sequence, Dict, Any, Tuple
from typing import List, Sequence, Dict, Any, Tuple, Optional
from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES
import typer
from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code
from ..schemas import ConfigSchema
from ..gold import Corpus, Example
from ..syntax import nonproj
from ..language import Language
from ..util import load_model, get_lang_class
from .. import util
# Minimum number of expected occurrences of NER label in data to train new label
@@ -24,12 +27,11 @@ BLANK_MODEL_THRESHOLD = 2000
@app.command("debug-data")
def debug_data_cli(
# fmt: off
lang: str = Arg(..., help="Model language"),
ctx: typer.Context, # This is only used to read additional arguments
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map", exists=True, dir_okay=False),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Name of model to update (optional)"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of pipeline components to train"),
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@@ -40,13 +42,13 @@ def debug_data_cli(
stats, and find problems like invalid entity annotations, cyclic
dependencies, low data labels and more.
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
debug_data(
lang,
train_path,
dev_path,
tag_map_path=tag_map_path,
base_model=base_model,
pipeline=[p.strip() for p in pipeline.split(",")],
config_path,
config_overrides=overrides,
ignore_warnings=ignore_warnings,
verbose=verbose,
no_format=no_format,
@@ -55,13 +57,11 @@
def debug_data(
lang: str,
train_path: Path,
dev_path: Path,
config_path: Path,
*,
tag_map_path: Optional[Path] = None,
base_model: Optional[str] = None,
pipeline: List[str] = ["tagger", "parser", "ner"],
config_overrides: Dict[str, Any] = {},
ignore_warnings: bool = False,
verbose: bool = False,
no_format: bool = True,
@@ -75,25 +75,27 @@ def debug_data(
msg.fail("Training data not found", train_path, exits=1)
if not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
if not config_path.exists():
msg.fail("Config file not found", config_path, exists=1)
with show_validation_error():
config = util.load_config(
config_path,
create_objects=False,
schema=ConfigSchema,
overrides=config_overrides,
)
nlp = util.load_model_from_config(config["nlp"])
lang = config["nlp"]["lang"]
base_model = config["nlp"]["base_model"]
pipeline = list(config["nlp"]["pipeline"].keys())
tag_map_path = util.ensure_path(config["training"]["tag_map"])
tag_map = {}
if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path)
# Initialize the model and pipeline
if base_model:
nlp = load_model(base_model)
else:
lang_cls = get_lang_class(lang)
nlp = lang_cls()
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
msg.divider("Data format validation")
# TODO: Validate data format using the JSON schema
# TODO: update once the new format is ready
# TODO: move validation to GoldCorpus in order to be able to load from dir
msg.divider("Data file validation")
# Create the gold corpus to be able to better analyze data
loading_train_error_message = ""
@@ -380,7 +382,7 @@ def debug_data(
if gold_dev_data["n_nonproj"] > 0:
n_nonproj = gold_dev_data["n_nonproj"]
msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
msg.info(f"{labels_train_unpreprocessed} label(s) in train data")
msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
msg.info(f"{len(labels_train)} label(s) in projectivized train data")
labels_with_counts = _format_labels(
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
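
The debug-data command above now reads lang, base_model, pipeline and the tag map from the config instead of taking them as CLI options, and any leftover command-line arguments are collected through the typer context and parsed into config overrides. A standalone sketch of that context pattern follows; the command name and context settings are illustrative, not spaCy's actual registration.

import typer

app = typer.Typer()

@app.command(
    "demo",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def demo(ctx: typer.Context, config_path: str = typer.Argument(...)):
    # Undeclared arguments end up on ctx.args; in the commands above they are
    # handed to parse_config_overrides() and merged into the loaded config.
    print("extra args for overrides:", ctx.args)

if __name__ == "__main__":
    app()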

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Dict, Any
import random
import numpy
import time
@@ -11,8 +11,11 @@ from thinc.api import CosineDistance, L2Distance
from wasabi import msg
import srsly
from functools import partial
import typer
from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..schemas import ConfigSchema
from ..errors import Errors
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
@@ -24,10 +27,11 @@ from .. import util
@app.command("pretrain")
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
# fmt: on
@@ -51,11 +55,13 @@ def pretrain_cli(
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
pretrain(
texts_loc,
output_dir,
config_path,
use_gpu=use_gpu,
config_overrides=overrides,
resume_path=resume_path,
epoch_resume=epoch_resume,
)
@@ -65,24 +71,34 @@ def pretrain(
texts_loc: Path,
output_dir: Path,
config_path: Path,
use_gpu: int = -1,
config_overrides: Dict[str, Any] = {},
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
):
verify_cli_args(**locals())
verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
msg.info(f"Loading config from: {config_path}")
with show_validation_error():
config = util.load_config(
config_path,
create_objects=False,
validate=True,
schema=ConfigSchema,
overrides=config_overrides,
)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False)
fix_random_seed(config["pretraining"]["seed"])
seed = config["pretraining"]["seed"]
if seed is not None:
fix_random_seed(seed)
if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
use_pytorch_for_gpu_memory()
@@ -360,9 +376,7 @@ def _smart_round(figure, width=10, max_decimal=4):
return format_str % figure
def verify_cli_args(
texts_loc, output_dir, config_path, use_gpu, resume_path, epoch_resume
):
def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
@@ -401,10 +415,3 @@ def verify_cli_args(
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
exits=True,
)
config = util.load_config(config_path, create_objects=False)
if config["pretraining"]["objective"]["type"] == "vectors":
if not config["nlp"]["vectors"]:
msg.fail(
"Must specify nlp.vectors if pretraining.objective.type is vectors",
exits=True,
)

@@ -11,6 +11,7 @@ import random
import typer
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..gold import Corpus, Example
from ..lookups import Lookups
from .. import util
@@ -53,17 +54,10 @@ def train_cli(
"""
util.set_env_log(verbose)
verify_cli_args(
train_path=train_path,
dev_path=dev_path,
config_path=config_path,
code_path=code_path,
train_path=train_path, dev_path=dev_path, config_path=config_path,
)
overrides = parse_config_overrides(ctx.args)
if code_path is not None:
try:
util.import_file("python_code", code_path)
except Exception as e:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
import_code(code_path)
train(
config_path,
{"train": train_path, "dev": dev_path},
@@ -503,7 +497,6 @@ def verify_cli_args(
dev_path: Path,
config_path: Path,
output_path: Optional[Path] = None,
code_path: Optional[Path] = None,
):
# Make sure all files and paths exists if they are needed
if not config_path or not config_path.exists():
@@ -524,9 +517,6 @@
"the specified output path doesn't exist, the directory will be "
"created for you.",
)
if code_path is not None:
if not code_path.exists():
msg.fail("Path to Python code not found", code_path, exits=1)
def verify_textcat_config(nlp, nlp_config):

@@ -2,7 +2,7 @@ from typing import Dict, List, Union, Optional, Sequence, Any
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic import FilePath, DirectoryPath
from pydantic import FilePath, DirectoryPath, root_validator
from collections import defaultdict
from thinc.api import Model, Optimizer
@@ -242,6 +242,7 @@ class ConfigSchemaPipeline(BaseModel):
class ConfigSchemaNlp(BaseModel):
lang: StrictStr = Field(..., title="The base language to use")
base_model: Optional[StrictStr] = Field(..., title="The base model to use")
vectors: Optional[DirectoryPath] = Field(..., title="Path to vectors")
pipeline: Optional[ConfigSchemaPipeline]
@@ -250,9 +251,40 @@ arbitrary_types_allowed = True
arbitrary_types_allowed = True
class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
min_length: StrictInt = Field(..., title="Minimum length of examples")
max_length: StrictInt = Field(..., title="Maximum length of examples")
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
seed: Optional[StrictInt] = Field(..., title="Random seed")
use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. nlp.pipeline.tok2vec.model")
optimizer: Optimizer = Field(..., title="The optimizer to use")
# TODO: use a more detailed schema for this?
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
training: ConfigSchemaTraining
nlp: ConfigSchemaNlp
pretraining: Optional[ConfigSchemaPretrain]
@root_validator
def validate_config(cls, values):
"""Perform additional validation for settings with dependencies."""
pt = values.get("pretraining")
if pt and pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
err = "Need nlp.vectors if pretraining.objective.type is vectors"
raise ValueError(err)
return values
class Config:
extra = "allow"
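
The root_validator above moves the vectors/objective dependency check out of the pretrain command and into schema validation. A standalone sketch of the same pydantic v1 pattern with minimal stand-in models (names and fields simplified, not the real schemas):

from typing import Any, Dict, Optional
from pydantic import BaseModel, ValidationError, root_validator

class Nlp(BaseModel):
    vectors: Optional[str] = None

class Pretrain(BaseModel):
    objective: Dict[str, Any] = {}

class FullConfig(BaseModel):
    nlp: Nlp
    pretraining: Optional[Pretrain] = None

    @root_validator
    def validate_config(cls, values):
        # Cross-field check: pretraining on vectors requires nlp.vectors.
        pt = values.get("pretraining")
        if pt and pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
            raise ValueError("Need nlp.vectors if pretraining.objective.type is vectors")
        return values

try:
    FullConfig(nlp={}, pretraining={"objective": {"type": "vectors"}})
except ValidationError as err:
    print(err)  # reports the missing nlp.vectors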

@@ -12,6 +12,8 @@ nlp_config_string = """
[nlp]
lang = "en"
[nlp.pipeline]
[nlp.pipeline.tok2vec]
factory = "tok2vec"