Mirror of https://github.com/explosion/spaCy.git
commit 240e0a62ca ("Update with WIP")
parent a60562f208
spacy/cli/train.py

@@ -1,13 +1,12 @@
-from typing import Optional, Dict, List, Union, Sequence
+from typing import Optional, Dict
 from timeit import default_timer as timer
 import srsly
 import tqdm
-from pydantic import BaseModel, FilePath
 from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules
-from thinc.api import Model, use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
+from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 import random

 from ._app import app, Arg, Opt
@@ -15,108 +14,15 @@ from ..gold import Corpus, Example
 from ..lookups import Lookups
 from .. import util
 from ..errors import Errors
+from ..schemas import ConfigSchema


 # Don't remove - required to load the built-in architectures
 from ..ml import models  # noqa: F401

-# from ..schemas import ConfigSchema  # TODO: include?
-
-
 registry = util.registry

-CONFIG_STR = """
-[training]
-patience = 10
-eval_frequency = 10
-dropout = 0.2
-init_tok2vec = null
-max_epochs = 100
-orth_variant_level = 0.0
-gold_preproc = false
-max_length = 0
-use_gpu = 0
-scores = ["ents_p", "ents_r", "ents_f"]
-score_weights = {"ents_f": 1.0}
-limit = 0
-
-[training.batch_size]
-@schedules = "compounding.v1"
-start = 100
-stop = 1000
-compound = 1.001
-
-[optimizer]
-@optimizers = "Adam.v1"
-learn_rate = 0.001
-beta1 = 0.9
-beta2 = 0.999
-
-[nlp]
-lang = "en"
-vectors = null
-
-[nlp.pipeline.tok2vec]
-factory = "tok2vec"
-
-[nlp.pipeline.ner]
-factory = "ner"
-
-[nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
-hidden_width = 64
-maxout_pieces = 3
-
-[nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
-
-[nlp.pipeline.tok2vec.model]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = ${nlp:vectors}
-width = 128
-depth = 4
-window_size = 1
-embed_size = 10000
-maxout_pieces = 3
-subword_features = true
-"""
-
-
-class PipelineComponent(BaseModel):
-    factory: str
-    model: Model
-
-    class Config:
-        arbitrary_types_allowed = True
-
-
-class ConfigSchema(BaseModel):
-    optimizer: Optional["Optimizer"]
-
-    class training(BaseModel):
-        patience: int = 10
-        eval_frequency: int = 100
-        dropout: float = 0.2
-        init_tok2vec: Optional[FilePath] = None
-        max_epochs: int = 100
-        orth_variant_level: float = 0.0
-        gold_preproc: bool = False
-        max_length: int = 0
-        use_gpu: int = 0
-        scores: List[str] = ["ents_p", "ents_r", "ents_f"]
-        score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
-        limit: int = 0
-        batch_size: Union[Sequence[int], int]
-
-    class nlp(BaseModel):
-        lang: str
-        vectors: Optional[str]
-        pipeline: Optional[Dict[str, PipelineComponent]]
-
-        class Config:
-            extra = "allow"
-

 @app.command("train")
 def train_cli(
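Aside: the removed CONFIG_STR illustrates the Thinc config format this branch standardizes on. Sections map to nested dicts, "@"-prefixed keys name registered functions (schedules, optimizers, architectures), and ${section:key} interpolates values across sections. A minimal sketch of how such a string parses, assuming thinc 8's Config.from_str; section names and values mirror the removed block:

# Minimal sketch, not part of the commit: parsing a Thinc-style config string.
from thinc.api import Config

cfg_str = """
[training]
patience = 10
dropout = 0.2

[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
"""

config = Config().from_str(cfg_str)
# Sections become nested dicts; "@schedules" stays a plain key until the
# registry resolves it into a schedule object (e.g. with create_objects=True).
assert config["training"]["patience"] == 10
assert config["training"]["batch_size"]["@schedules"] == "compounding.v1"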
@@ -126,12 +32,7 @@ def train_cli(
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
-    raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
-    tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
-    omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
     # fmt: on
 ):
     """
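Aside: Arg and Opt come from spacy.cli._app, which in this branch appears to wrap typer so that parameter defaults double as CLI declarations. A hedged sketch of the pattern; the aliases and the demo command are illustrative, not spaCy's actual module:

# Hedged sketch: how typer-style Arg/Opt defaults declare a CLI.
from pathlib import Path
from typing import Optional
import typer

app = typer.Typer()
Arg = typer.Argument  # assumed aliasing, mirroring spaCy's _app
Opt = typer.Option

@app.command("demo-train")
def demo_train(
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output directory"),
    verbose: bool = Opt(False, "--verbose", "-V", help="Display more information"),
):
    typer.echo(f"config={config_path} output={output_path} verbose={verbose}")

if __name__ == "__main__":
    app()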
@@ -141,33 +42,11 @@ def train_cli(
     """
     util.set_env_log(verbose)
     verify_cli_args(**locals())
-    if raw_text is not None:
-        raw_text = list(srsly.read_jsonl(raw_text))
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-
-    weights_data = None
-    if init_tok2vec is not None:
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-
-    if use_gpu >= 0:
-        msg.info("Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-
-    train(
-        config_path,
-        {"train": train_path, "dev": dev_path},
-        output_path=output_path,
-        raw_text=raw_text,
-        tag_map=tag_map,
-        weights_data=weights_data,
-        omit_extra_lookups=omit_extra_lookups,
-    )
+    try:
+        util.import_file("python_code", code_path)
+    except Exception as e:
+        msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
+    train(config_path, {"train": train_path, "dev": dev_path}, output_path=output_path)


 def train(
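Aside: train_cli now imports user code up front so that any functions it registers are available when the config is resolved. The diff doesn't show util.import_file itself; a plausible sketch of such a helper, using only the standard library:

# Plausible implementation sketch; spaCy's actual helper may differ.
# Executing the module runs its @registry decorators as a side effect.
import importlib.util
from pathlib import Path
from types import ModuleType

def import_file(name: str, loc: Path) -> ModuleType:
    spec = importlib.util.spec_from_file_location(name, str(loc))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # may raise; the caller wraps this in try/except
    return module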
@@ -175,19 +54,24 @@ def train(
     data_paths: Dict[str, Path],
     raw_text: Optional[Path] = None,
     output_path: Optional[Path] = None,
-    tag_map: Optional[Path] = None,
     weights_data: Optional[bytes] = None,
-    omit_extra_lookups: bool = False,
 ) -> None:
     msg.info(f"Loading config from: {config_path}")
     # Read the config first without creating objects, to get to the original nlp_config
-    config = util.load_config(config_path, create_objects=False)
+    config = util.load_config(config_path, create_objects=False, schema=ConfigSchema)
+    use_gpu = config["training"]["use_gpu"]
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
+    raw_text, tag_map, weights_data = load_from_paths(config)
     fix_random_seed(config["training"]["seed"])
     if config["training"].get("use_pytorch_for_gpu_memory"):
         # It feels kind of weird to not have a default for this.
         use_pytorch_for_gpu_memory()
     nlp_config = config["nlp"]
-    config = util.load_config(config_path, create_objects=True)
+    config = util.load_config(config_path, create_objects=True, schema=ConfigSchema)
     training = config["training"]
     msg.info("Creating nlp from config")
     nlp = util.load_model_from_config(nlp_config)
@@ -216,7 +100,7 @@ def train(

     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
-    if omit_extra_lookups:
+    if config["omit_extra_lookups"]:
         nlp.vocab.lookups_extra = Lookups()
         nlp.vocab.lookups_extra.add_table("lexeme_cluster")
         nlp.vocab.lookups_extra.add_table("lexeme_prob")
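Aside: the omit_extra_lookups trick relies on explicitly registered empty tables taking precedence, so later lookups hit an empty table instead of lazily loading data from spacy-lookups-data. A small sketch, not part of the commit, with the table names from the diff:

from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lexeme_cluster")
lookups.add_table("lexeme_prob")
assert lookups.has_table("lexeme_prob")            # the table exists...
assert len(lookups.get_table("lexeme_prob")) == 0  # ...but holds no data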
@@ -556,18 +440,36 @@ def update_meta(training, nlp, info):
             nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]


+def load_from_paths(config):
+    # TODO: separate checks from loading
+    raw_text = util.ensure_path(config["training"]["raw_text"])
+    if raw_text is not None:
+        if not raw_text.exists():
+            msg.fail("Can't find raw text", raw_text, exits=1)
+        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
+    tag_map = {}
+    tag_map_path = util.ensure_path(config["training"]["tag_map"])
+    if tag_map_path is not None:
+        if not tag_map_path.exists():
+            msg.fail("Can't find tag map path", tag_map_path, exits=1)
+        tag_map = srsly.read_json(config["training"]["tag_map"])
+    weights_data = None
+    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
+    if init_tok2vec is not None:
+        if not init_tok2vec.exists():
+            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    return raw_text, tag_map, weights_data
+
+
 def verify_cli_args(
-    train_path,
-    dev_path,
-    config_path,
-    output_path=None,
-    code_path=None,
-    init_tok2vec=None,
-    raw_text=None,
-    verbose=False,
-    use_gpu=-1,
-    tag_map_path=None,
-    omit_extra_lookups=False,
+    train_path: Path,
+    dev_path: Path,
+    config_path: Path,
+    output_path: Optional[Path] = None,
+    code_path: Optional[Path] = None,
+    verbose: bool = False,
 ):
     # Make sure all files and paths exists if they are needed
     if not config_path or not config_path.exists():
@@ -591,12 +493,6 @@ def verify_cli_args(
     if code_path is not None:
         if not code_path.exists():
             msg.fail("Path to Python code not found", code_path, exits=1)
-        try:
-            util.import_file("python_code", code_path)
-        except Exception as e:
-            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
-    if init_tok2vec is not None and not init_tok2vec.exists():
-        msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)


 def verify_textcat_config(nlp, nlp_config):
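Aside: with load_from_paths, auxiliary training inputs are declared under [training] in the config (raw_text, tag_map, init_tok2vec) rather than passed as CLI flags. A hedged usage sketch; the config path is illustrative:

# Usage sketch: the config, not the CLI, now says where inputs live.
config = util.load_config("config.cfg", create_objects=False, schema=ConfigSchema)
raw_text, tag_map, weights_data = load_from_paths(config)
# raw_text: list of JSONL records or None; tag_map: dict (possibly empty);
# weights_data: raw bytes of the pretrained tok2vec weights, or None.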
spacy/schemas.py (110 changed lines)
@@ -1,9 +1,10 @@
 from typing import Dict, List, Union, Optional, Sequence, Any
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, FilePath
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
+from pydantic import FilePath, DirectoryPath
 from collections import defaultdict
-from thinc.api import Model
+from thinc.api import Model, Optimizer

 from .attrs import NAMES

@@ -173,41 +174,6 @@ class ModelMetaSchema(BaseModel):
 # JSON training format


-class PipelineComponent(BaseModel):
-    factory: str
-    model: Model
-
-    class Config:
-        arbitrary_types_allowed = True
-
-
-class ConfigSchema(BaseModel):
-    optimizer: Optional["Optimizer"]
-
-    class training(BaseModel):
-        patience: int = 10
-        eval_frequency: int = 100
-        dropout: float = 0.2
-        init_tok2vec: Optional[FilePath] = None
-        max_epochs: int = 100
-        orth_variant_level: float = 0.0
-        gold_preproc: bool = False
-        max_length: int = 0
-        use_gpu: int = 0
-        scores: List[str] = ["ents_p", "ents_r", "ents_f"]
-        score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
-        limit: int = 0
-        batch_size: Union[Sequence[int], int]
-
-    class nlp(BaseModel):
-        lang: str
-        vectors: Optional[str]
-        pipeline: Optional[Dict[str, PipelineComponent]]
-
-        class Config:
-            extra = "allow"
-
-
 class TrainingSchema(BaseModel):
     # TODO: write
@@ -216,6 +182,76 @@ class TrainingSchema(BaseModel):
         extra = "forbid"


+# Config schema
+# We're not setting any defaults here (which is too messy) and are making all
+# fields required, so we can raise validation errors for missing values. To
+# provide a default, we include a separate .cfg file with all values and
+# check that against this schema in the test suite to make sure it's always
+# up to date.
+
+
+class ConfigSchemaTraining(BaseModel):
+    # fmt: off
+    gold_preproc: StrictBool = Field(..., title="Whether to train on gold-standard sentences and tokens")
+    max_length: StrictInt = Field(..., title="Maximum length of examples (longer examples are divided into sentences if possible)")
+    limit: StrictInt = Field(..., title="Number of examples to use (0 for all)")
+    orth_variant_level: StrictFloat = Field(..., title="Orth variants for data augmentation")
+    dropout: StrictFloat = Field(..., title="Dropout rate")
+    patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
+    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
+    max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
+    eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
+    seed: StrictInt = Field(..., title="Random seed")
+    accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
+    use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
+    use_gpu: StrictInt = Field(..., title="GPU ID or -1 for CPU")
+    scores: List[StrictStr] = Field(..., title="Score types to be printed in overview")
+    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model")
+    init_tok2vec: Optional[FilePath] = Field(..., title="Path to pretrained tok2vec weights")
+    discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size")
+    omit_extra_lookups: StrictBool = Field(..., title="Don't include extra lookups in model")
+    batch_by: StrictStr = Field(..., title="Batch examples by type")
+    raw_text: Optional[FilePath] = Field(..., title="Raw text")
+    tag_map: Optional[FilePath] = Field(..., title="Path to JSON-formatted tag map")
+    batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
+    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
+class ConfigSchemaNlpComponent(BaseModel):
+    factory: StrictStr = Field(..., title="Component factory name")
+    model: Model = Field(..., title="Component model")
+    # TODO: add config schema / types for components so we can fill and validate
+    # component options like learn_tokens, min_action_freq etc.
+
+    class Config:
+        extra = "allow"
+        arbitrary_types_allowed = True
+
+
+class ConfigSchemaNlp(BaseModel):
+    lang: StrictStr = Field(..., title="The base language to use")
+    vectors: Optional[DirectoryPath] = Field(..., title="Path to vectors")
+    pipeline: Optional[Dict[str, ConfigSchemaNlpComponent]]
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
+class ConfigSchema(BaseModel):
+    training: ConfigSchemaTraining
+    nlp: ConfigSchemaNlp
+
+    class Config:
+        extra = "allow"
+        arbitrary_types_allowed = True
+
+
 # Project config Schema
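Aside: because every field is declared with Field(...) (required, no default) and strict types, a config that omits a key or uses the wrong type fails at validation time rather than mid-training. A self-contained sketch of that behavior under pydantic v1 semantics; the mini schema is illustrative:

from pydantic import BaseModel, Field, StrictInt, ValidationError

class MiniTrainingSchema(BaseModel):
    seed: StrictInt = Field(..., title="Random seed")
    patience: StrictInt = Field(..., title="Steps without improvement")

    class Config:
        extra = "forbid"

try:
    MiniTrainingSchema(seed="0", patience=10)  # "0" is a str; strict ints don't coerce
except ValidationError as err:
    print(err)

try:
    MiniTrainingSchema(seed=0)  # patience missing: required fields raise
except ValidationError as err:
    print(err)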
spacy/util.py

@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Union, Type, Dict, Any
 import os
 import importlib
 import importlib.util
@@ -6,6 +6,8 @@ import re
 from pathlib import Path
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config
+from thinc.config import EmptySchema
+from pydantic import BaseModel
 import functools
 import itertools
 import numpy.random
@@ -20,6 +22,7 @@ import subprocess
 from contextlib import contextmanager
 import tempfile
 import shutil
+import hashlib
 import shlex

 try:
@@ -326,20 +329,29 @@ def get_base_version(version):
     return Version(version).base_version


-def load_config(path, create_objects=False):
+def load_config(
+    path: Union[Path, str],
+    *,
+    create_objects: bool = False,
+    schema: Type[BaseModel] = EmptySchema,
+    validate: bool = True,
+) -> Dict[str, Any]:
     """Load a Thinc-formatted config file, optionally filling in objects where
     the config references registry entries. See "Thinc config files" for details.

     path (str / Path): Path to the config file
     create_objects (bool): Whether to automatically create objects when the config
         references registry entries. Defaults to False.
+    schema (BaseModel): Optional pydantic base schema to use for validation.
     RETURNS (dict): The objects from the config file.
     """
     config = thinc.config.Config().from_disk(path)
     if create_objects:
-        return registry.make_from_config(config, validate=True)
+        return registry.make_from_config(config, validate=validate, schema=schema)
     else:
+        # Just fill config here so we can validate and fail early
+        if validate and schema:
+            registry.fill_config(config, validate=validate, schema=schema)
         return config
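Aside: the updated load_config validates against a pydantic schema in both modes: make_from_config validates while building objects, and fill_config validates the plain values without building anything (EmptySchema, fill_config, and the schema keyword are WIP Thinc hooks this branch assumes, not released thinc API). A hedged usage sketch:

# Usage sketch; import paths assumed, config path illustrative.
from spacy import util
from spacy.schemas import ConfigSchema

# Pass 1: plain values, validated early, so a bad config fails before any
# models or optimizers are constructed.
config = util.load_config("config.cfg", create_objects=False, schema=ConfigSchema)
use_gpu = config["training"]["use_gpu"]

# Pass 2: resolve @-references into real objects (models, schedules, optimizer).
config = util.load_config("config.cfg", create_objects=True, schema=ConfigSchema)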