Mirror of https://github.com/explosion/spaCy.git (synced 2025-03-21 18:34:14 +03:00)

Merge branch 'develop' into nightly.spacy.io

This commit is contained in: commit 472b9b4fa3

Makefile: 2 changes
@@ -1,7 +1,7 @@
 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
 endif

 ifndef PYVER
@@ -1,7 +1,7 @@
 from pathlib import Path
 import plac
 import spacy
-from spacy.gold import docs_to_json
+from spacy.training import docs_to_json
 import srsly
 import sys
@@ -31,10 +31,13 @@ lang = "en"
 vectors = null

 [nlp.pipeline.ner]
-factory = "simple_ner"
+factory = "ner"

 [nlp.pipeline.ner.model]
-@architectures = "spacy.BiluoTagger.v1"
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2

 [nlp.pipeline.ner.model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
@@ -6,7 +6,7 @@ requires = [
 "cymem>=2.0.2,<2.1.0",
 "preshed>=3.0.2,<3.1.0",
 "murmurhash>=0.28.0,<1.1.0",
-"thinc>=8.0.0a30,<8.0.0a40",
+"thinc>=8.0.0a31,<8.0.0a40",
 "blis>=0.4.0,<0.5.0",
 "pytokenizations",
 "pathy"
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
@@ -34,13 +34,13 @@ setup_requires =
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 murmurhash>=0.28.0,<1.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 install_requires =
 # Our libraries
 murmurhash>=0.28.0,<1.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0
@@ -64,7 +64,7 @@ console_scripts =

 [options.extras_require]
 lookups =
-spacy_lookups_data>=0.3.2,<0.4.0
+spacy_lookups_data==0.4.0.dev0
 cuda =
 cupy>=5.0.0b4,<9.0.0
 cuda80 =
setup.py: 4 changes
@@ -23,7 +23,7 @@ Options.docstrings = True

 PACKAGES = find_packages()
 MOD_NAMES = [
-"spacy.gold.example",
+"spacy.training.example",
 "spacy.parts_of_speech",
 "spacy.strings",
 "spacy.lexeme",
@@ -48,7 +48,7 @@ MOD_NAMES = [
 "spacy.pipeline._parser_internals.stateclass",
 "spacy.pipeline._parser_internals.transition_system",
 "spacy.tokenizer",
-"spacy.gold.gold_io",
+"spacy.training.gold_io",
 "spacy.tokens.doc",
 "spacy.tokens.span",
 "spacy.tokens.token",
@@ -1,7 +1,8 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a14"
+__version__ = "3.0.0a16"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__projects__ = "https://github.com/explosion/spacy-boilerplates"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
 import sys
 import shutil
 from pathlib import Path
@@ -6,6 +6,7 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
+import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -13,7 +14,7 @@ from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError

 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir
+from ..util import import_file, run_command, make_tempdir, registry

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -54,6 +55,8 @@ app.add_typer(init_cli)


 def setup_cli() -> None:
+    # Make sure the entry-point for CLI runs, so that they get imported.
+    registry.cli.get_all()
     # Ensure that the help messages always display the correct prompt
     command = get_command(app)
     command(prog_name=COMMAND)
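
For reference, a rough sketch of what the new registry.cli.get_all() call returns. This is an illustration, not part of the diff; it assumes the "cli" registry table exists on spacy.util.registry, as the change above implies.

    from spacy.util import registry

    cli_callbacks = registry.cli.get_all()   # {name: callback} for every registered CLI hook
    for name, func in cli_callbacks.items(): # importing the table is what loads the entry-point modules
        print(name, func)
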
@@ -318,33 +321,87 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
     # *that* we can do by path.
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
+        git_version = get_git_version()
+        supports_sparse = git_version >= (2, 22)
         # This is the "clone, but don't download anything" part.
-        cmd = (
-            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            f"--filter=blob:none "  # <-- The key bit
-            f"-b {branch}"
-        )
+        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
+        if supports_sparse:
+            cmd += f"--filter=blob:none"  # <-- The key bit
+        else:
+            msg.warn(
+                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
+                f"that doesn't fully support sparse checkout yet. This means that "
+                f"more files than necessary may be downloaded temporarily. To "
+                f"only download the files needed, upgrade to Git v2.22 or above."
+            )
-        run_command(cmd, capture=True)
+        _attempt_run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
-        ret = run_command(cmd, capture=True)
-        repo = _from_http_to_git(repo)
+        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
+        ret = _attempt_run_command(cmd)
+        git_repo = _from_http_to_git(repo)
        # Now pass those missings into another bit of git internals
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
-        run_command(cmd, capture=True)
+        if supports_sparse and not missings:
+            err = (
+                f"Could not find any relevant files for '{subpath}'. "
+                f"Did you specify a correct and complete path within repo '{repo}' "
+                f"and branch {branch}?"
+            )
+            msg.fail(err, exits=1)
+        if supports_sparse:
+            cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
+            _attempt_run_command(cmd)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        run_command(cmd)
+        _attempt_run_command(cmd)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(subpath)), str(dest))
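
A short illustration of how the "missings" string above is assembled: git rev-list with --missing=print prefixes each missing object ID with "?". The output below is made up for the example and is not part of the diff.

    stdout = "?a1b2c3\n?d4e5f6\ndeadbeef some/tracked/file\n"   # hypothetical rev-list output
    missings = " ".join(x[1:] for x in stdout.split() if x.startswith("?"))
    assert missings == "a1b2c3 d4e5f6"
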


-def _from_http_to_git(repo):
+def get_git_version() -> Tuple[int, int]:
+    ret = _attempt_run_command(["git", "--version"])
+    # TODO: this seems kinda brittle?
+    version = ret.stdout[11:].strip().split(".")
+    return (int(version[0]), int(version[1]))
+
+
+def _attempt_run_command(cmd: Union[str, List[str]]):
+    try:
+        return run_command(cmd, capture=True)
+    except subprocess.CalledProcessError as e:
+        err = f"Could not run command"
+        msg.fail(err)
+        print(cmd)
+        sys.exit(1)
+
+
+def _from_http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")
     if repo.startswith(r"https://"):
         repo = repo.replace("https://", "git@").replace("/", ":", 1)
     if repo.endswith("/"):
         repo = repo[:-1]
     repo = f"{repo}.git"
     return repo
+
+
+def string_to_list(value, intify=False):
+    """Parse a comma-separated string to a list"""
+    if not value:
+        return []
+    if value.startswith("[") and value.endswith("]"):
+        value = value[1:-1]
+    result = []
+    for p in value.split(","):
+        p = p.strip()
+        if p.startswith("'") and p.endswith("'"):
+            p = p[1:-1]
+        if p.startswith('"') and p.endswith('"'):
+            p = p[1:-1]
+        p = p.strip()
+        if intify:
+            p = int(p)
+        result.append(p)
+    return result
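
For reference, a rough sketch of how the new helpers behave, using made-up inputs that mirror the implementations added above (these checks are illustrative, not part of the diff):

    # get_git_version() slices "git version X.Y.Z" at index 11 and strips, e.g.
    # "git version 2.30.1"[11:].strip() -> "2.30.1" -> (2, 30)
    assert _from_http_to_git("https://github.com/explosion/projects") == "git@github.com:explosion/projects.git"
    assert string_to_list("") == []
    assert string_to_list("ner, tagger") == ["ner", "tagger"]
    assert string_to_list("[1, 2, 3]", intify=True) == [1, 2, 3]
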
@@ -7,9 +7,9 @@ import re
 import sys

 from ._util import app, Arg, Opt
-from ..gold import docs_to_json
+from ..training import docs_to_json
 from ..tokens import DocBin
-from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs


 # Converters are matched by file extension except for ner/iob, which are
@@ -8,7 +8,7 @@ import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, get_sourced_components
-from ..gold import Corpus, Example
+from ..training import Corpus, Example
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
 from .. import util
@@ -5,7 +5,7 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation
 import typer

-from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
+from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
 from .. import util


@@ -38,12 +38,13 @@ def debug_model_cli(
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+    layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
         "parameters": parameters,
         "gradients": gradients,
         "attributes": attributes,
-        "layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
+        "layers": layers,
         "print_before_training": P0,
         "print_after_init": P1,
         "print_after_training": P2,
@@ -84,11 +85,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
         _print_model(model, print_settings)

     # STEP 1: Initializing the model and printing again
+    X = _get_docs()
     Y = _get_output(model.ops.xp)
-    _set_output_dim(nO=Y.shape[-1], model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=_get_docs(), Y=Y)
+        model.initialize(X=X, Y=Y)
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
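
A minimal sketch of the initialization pattern this step relies on, using plain Thinc objects. The model, shapes and values below are made up for illustration and are not part of the diff.

    from thinc.api import Linear, data_validation
    import numpy

    model = Linear()                       # toy stand-in for the debugged model
    X = numpy.zeros((4, 5), dtype="f")     # hypothetical sample inputs
    Y = numpy.zeros((4, 3), dtype="f")     # hypothetical sample outputs
    with data_validation(False):           # relax strict type checks, as in the diff
        model.initialize(X=X, Y=Y)         # nI/nO are inferred from the samples
    assert model.get_dim("nO") == 3
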
@@ -135,15 +136,6 @@ def _get_output(xp):
     return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")


-def _set_output_dim(model, nO):
-    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
-    if model.has_dim("nO") is None:
-        model.set_dim("nO", nO)
-    if model.has_ref("output_layer"):
-        if model.get_ref("output_layer").has_dim("nO") is None:
-            model.get_ref("output_layer").set_dim("nO", nO)
-
-
 def _print_model(model, print_settings):
     layers = print_settings.get("layers", "")
     parameters = print_settings.get("parameters", False)
@@ -5,7 +5,7 @@ import re
 import srsly
 from thinc.api import require_gpu, fix_random_seed

-from ..gold import Corpus
+from ..training import Corpus
 from ..tokens import Doc
 from ._util import app, Arg, Opt
 from ..scorer import Scorer
@@ -9,7 +9,7 @@ import re
 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list


 ROOT = Path(__file__).parent / "templates"
@@ -42,7 +42,7 @@ def init_config_cli(
     """
     if isinstance(optimize, Optimizations):  # instance of enum from the CLI
         optimize = optimize.value
-    pipeline = [p.strip() for p in pipeline.split(",")]
+    pipeline = string_to_list(pipeline)
     init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)


@@ -256,6 +256,7 @@ def add_vectors(

 def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
     f = open_file(vectors_loc)
+    f = ensure_shape(f)
     shape = tuple(int(size) for size in next(f).split())
     if truncate_vectors >= 1:
         shape = (truncate_vectors, shape[1])
@@ -274,6 +275,31 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
     return vectors_data, vectors_keys


+def ensure_shape(lines):
+    """Ensure that the first line of the data is the vectors shape.
+
+    If it's not, we read in the data and output the shape as the first result,
+    so that the reader doesn't have to deal with the problem.
+    """
+    first_line = next(lines)
+    try:
+        shape = tuple(int(size) for size in first_line.split())
+    except ValueError:
+        shape = None
+    if shape is not None:
+        # All good, give the data
+        yield first_line
+        yield from lines
+    else:
+        # Figure out the shape, make it the first value, and then give the
+        # rest of the data.
+        width = len(first_line.split()) - 1
+        captured = [first_line] + list(lines)
+        length = len(captured)
+        yield f"{length} {width}"
+        yield from captured
+
+
 def read_freqs(
     freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
 ):
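
A quick sketch of what ensure_shape does for a vectors file that is missing its header line. The toy data below (two 3-dimensional vectors) is made up for illustration and is not part of the diff.

    lines = iter(["cat 0.1 0.2 0.3", "dog 0.4 0.5 0.6"])   # hypothetical header-less input
    fixed = ensure_shape(lines)
    assert next(fixed) == "2 3"                # the "<rows> <width>" header is prepended
    assert next(fixed) == "cat 0.1 0.2 0.3"    # then the original data follows unchanged
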
@ -18,6 +18,7 @@ def package_cli(
|
|||
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
|
||||
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
||||
no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
|
||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
|
||||
|
@ -38,6 +39,7 @@ def package_cli(
|
|||
input_dir,
|
||||
output_dir,
|
||||
meta_path=meta_path,
|
||||
name=name,
|
||||
version=version,
|
||||
create_meta=create_meta,
|
||||
create_sdist=not no_sdist,
|
||||
|
@ -50,6 +52,7 @@ def package(
|
|||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
meta_path: Optional[Path] = None,
|
||||
name: Optional[str] = None,
|
||||
version: Optional[str] = None,
|
||||
create_meta: bool = False,
|
||||
create_sdist: bool = True,
|
||||
|
@ -71,6 +74,8 @@ def package(
|
|||
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
meta = get_meta(input_dir, meta)
|
||||
if name is not None:
|
||||
meta["name"] = name
|
||||
if version is not None:
|
||||
meta["version"] = version
|
||||
if not create_meta: # only print if user doesn't want to overwrite
|
||||
|
|
|
@ -38,16 +38,21 @@ def project_assets(project_dir: Path) -> None:
|
|||
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||
for asset in assets:
|
||||
dest = Path(asset["dest"])
|
||||
dest = (project_dir / asset["dest"]).resolve()
|
||||
checksum = asset.get("checksum")
|
||||
if "git" in asset:
|
||||
if dest.exists():
|
||||
# If there's already a file, check for checksum
|
||||
if checksum and checksum == get_checksum(dest):
|
||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||
msg.good(
|
||||
f"Skipping download with matching checksum: {asset['dest']}"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
if dest.is_dir():
|
||||
shutil.rmtree(dest)
|
||||
else:
|
||||
dest.unlink()
|
||||
git_sparse_checkout(
|
||||
asset["git"]["repo"],
|
||||
asset["git"]["path"],
|
||||
|
@ -67,14 +72,16 @@ def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
|
|||
"""Check and validate assets without a URL (private assets that the user
|
||||
has to provide themselves) and give feedback about the checksum.
|
||||
|
||||
dest (Path): Desintation path of the asset.
|
||||
dest (Path): Destination path of the asset.
|
||||
checksum (Optional[str]): Optional checksum of the expected file.
|
||||
"""
|
||||
if not Path(dest).exists():
|
||||
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||
msg.warn(err)
|
||||
else:
|
||||
if checksum and checksum == get_checksum(dest):
|
||||
if not checksum:
|
||||
msg.good(f"Asset already exists: {dest}")
|
||||
elif checksum == get_checksum(dest):
|
||||
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||
else:
|
||||
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||
|
|
|
@ -16,6 +16,7 @@ def project_clone_cli(
|
|||
name: str = Arg(..., help="The name of the template to clone"),
|
||||
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
|
||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
|
||||
branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from")
|
||||
# fmt: on
|
||||
):
|
||||
"""Clone a project template from a repository. Calls into "git" and will
|
||||
|
@ -26,23 +27,30 @@ def project_clone_cli(
|
|||
DOCS: https://nightly.spacy.io/api/cli#project-clone
|
||||
"""
|
||||
if dest is None:
|
||||
dest = Path.cwd() / name
|
||||
project_clone(name, dest, repo=repo)
|
||||
dest = Path.cwd() / Path(name).parts[-1]
|
||||
project_clone(name, dest, repo=repo, branch=branch)
|
||||
|
||||
|
||||
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
|
||||
def project_clone(
|
||||
name: str,
|
||||
dest: Path,
|
||||
*,
|
||||
repo: str = about.__projects__,
|
||||
branch: str = about.__projects_branch__,
|
||||
) -> None:
|
||||
"""Clone a project template from a repository.
|
||||
|
||||
name (str): Name of subdirectory to clone.
|
||||
dest (Path): Destination path of cloned project.
|
||||
repo (str): URL of Git repo containing project templates.
|
||||
branch (str): The branch to clone from
|
||||
"""
|
||||
dest = ensure_path(dest)
|
||||
check_clone(name, dest, repo)
|
||||
project_dir = dest.resolve()
|
||||
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
|
||||
try:
|
||||
git_sparse_checkout(repo, name, dest)
|
||||
git_sparse_checkout(repo, name, dest, branch=branch)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}'"
|
||||
msg.fail(err, exits=1)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
|
||||
from timeit import default_timer as timer
|
||||
import srsly
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
|
@ -15,7 +16,7 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|||
from ._util import import_code, get_sourced_components
|
||||
from ..language import Language
|
||||
from .. import util
|
||||
from ..gold.example import Example
|
||||
from ..training.example import Example
|
||||
from ..errors import Errors
|
||||
|
||||
|
||||
|
@ -286,9 +287,12 @@ def train_while_improving(
|
|||
]
|
||||
raw_batches = util.minibatch(raw_examples, size=8)
|
||||
|
||||
words_seen = 0
|
||||
start_time = timer()
|
||||
for step, (epoch, batch) in enumerate(train_data):
|
||||
dropout = next(dropouts)
|
||||
for subbatch in subdivide_batch(batch, accumulate_gradient):
|
||||
|
||||
nlp.update(
|
||||
subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
|
||||
)
|
||||
|
@ -317,6 +321,7 @@ def train_while_improving(
|
|||
else:
|
||||
score, other_scores = (None, None)
|
||||
is_best_checkpoint = None
|
||||
words_seen += sum(len(eg) for eg in batch)
|
||||
info = {
|
||||
"epoch": epoch,
|
||||
"step": step,
|
||||
|
@ -324,6 +329,8 @@ def train_while_improving(
|
|||
"other_scores": other_scores,
|
||||
"losses": losses,
|
||||
"checkpoints": results,
|
||||
"seconds": int(timer() - start_time),
|
||||
"words": words_seen,
|
||||
}
|
||||
yield batch, info, is_best_checkpoint
|
||||
if is_best_checkpoint is not None:
|
||||
|
|
|
@ -52,7 +52,7 @@ path = ${paths.train}
|
|||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length
|
||||
max_length = 2000
|
||||
max_length = 0
|
||||
# Limitation on number of training examples
|
||||
limit = 0
|
||||
|
||||
|
@ -64,7 +64,7 @@ path = ${paths.dev}
|
|||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length
|
||||
max_length = 2000
|
||||
max_length = 0
|
||||
# Limitation on number of training examples
|
||||
limit = 0
|
||||
|
||||
|
@ -88,9 +88,4 @@ L2 = 0.01
|
|||
grad_clip = 1.0
|
||||
use_averages = false
|
||||
eps = 1e-8
|
||||
|
||||
[training.optimizer.learn_rate]
|
||||
@schedules = "warmup_linear.v1"
|
||||
warmup_steps = 250
|
||||
total_steps = 20000
|
||||
initial_rate = 0.001
|
||||
learn_rate = 0.001
|
||||
|
|
|
@ -66,7 +66,7 @@ class Warnings:
|
|||
"in problems with the vocab further on in the pipeline.")
|
||||
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
||||
"entities \"{entities}\". Use "
|
||||
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||
"`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||
" to check the alignment. Misaligned entities ('-') will be "
|
||||
"ignored during training.")
|
||||
W033 = ("Training a new {model} using a model with no lexeme normalization "
|
||||
|
@ -247,8 +247,8 @@ class Errors:
|
|||
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
|
||||
E065 = ("Only one of the vector table's width and shape can be specified. "
|
||||
"Got width {width} and shape {shape}.")
|
||||
E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
|
||||
"an entity) without a preceding 'B' (beginning of an entity). "
|
||||
E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} "
|
||||
"without a preceding 'B' (beginning of an entity). "
|
||||
"Tag sequence:\n{tags}")
|
||||
E068 = ("Invalid BILUO tag: '{tag}'.")
|
||||
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
|
||||
|
@ -320,10 +320,6 @@ class Errors:
|
|||
"So instead of pickling the span, pickle the Doc it belongs to or "
|
||||
"use Span.as_doc to convert the span to a standalone Doc object.")
|
||||
E115 = ("All subtokens must have associated heads.")
|
||||
E116 = ("Cannot currently add labels to pretrained text classifier. Add "
|
||||
"labels before training begins. This functionality was available "
|
||||
"in previous versions, but had significant bugs that led to poor "
|
||||
"performance.")
|
||||
E117 = ("The newly split tokens must match the text of the original token. "
|
||||
"New orths: {new}. Old text: {old}.")
|
||||
E118 = ("The custom extension attribute '{attr}' is not registered on the "
|
||||
|
@ -378,8 +374,9 @@ class Errors:
|
|||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
"provided {found}.")
|
||||
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
|
||||
"call add_label()?")
|
||||
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
|
||||
"by calling add_label, or by providing a representative batch of "
|
||||
"examples to the component's begin_training method.")
|
||||
E145 = ("Error reading `{param}` from input file.")
|
||||
E146 = ("Could not access `{path}`.")
|
||||
E147 = ("Unexpected error in the {method} functionality of the "
|
||||
|
@ -483,6 +480,16 @@ class Errors:
|
|||
E201 = ("Span index out of range.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E921 = ("The method 'set_output' can only be called on components that have "
|
||||
"a Model with a 'resize_output' attribute. Otherwise, the output "
|
||||
"layer can not be dynamically changed.")
|
||||
E922 = ("Component '{name}' has been initialized with an output dimension of "
|
||||
"{nO} - cannot add any more labels.")
|
||||
E923 = ("It looks like there is no proper sample data to initialize the "
|
||||
"Model of component '{name}'. "
|
||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||
E924 = ("The '{name}' component does not seem to be initialized properly. "
|
||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
|
||||
"mapping label names to colors but got: {obj}")
|
||||
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
|
||||
|
|
|
@ -17,7 +17,7 @@ from timeit import default_timer as timer
|
|||
from .tokens.underscore import Underscore
|
||||
from .vocab import Vocab, create_vocab
|
||||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||
from .gold import Example, validate_examples
|
||||
from .training import Example, validate_examples
|
||||
from .scorer import Scorer
|
||||
from .util import create_default_optimizer, registry, SimpleFrozenList
|
||||
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
||||
|
@ -243,6 +243,7 @@ class Language:
|
|||
self._config["nlp"]["pipeline"] = list(self.component_names)
|
||||
self._config["nlp"]["disabled"] = list(self.disabled)
|
||||
self._config["components"] = pipeline
|
||||
if not self._config["training"].get("score_weights"):
|
||||
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
|
||||
if not srsly.is_json_serializable(self._config):
|
||||
raise ValueError(Errors.E961.format(config=self._config))
|
||||
|
@ -656,7 +657,7 @@ class Language:
|
|||
return resolved[factory_name]
|
||||
|
||||
def create_pipe_from_source(
|
||||
self, source_name: str, source: "Language", *, name: str,
|
||||
self, source_name: str, source: "Language", *, name: str
|
||||
) -> Tuple[Callable[[Doc], Doc], str]:
|
||||
"""Create a pipeline component by copying it from an existing model.
|
||||
|
||||
|
@ -1155,10 +1156,13 @@ class Language:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/language#begin_training
|
||||
"""
|
||||
# TODO: throw warning when get_gold_tuples is provided instead of get_examples
|
||||
if get_examples is None:
|
||||
get_examples = lambda: []
|
||||
else: # Populate vocab
|
||||
util.logger.debug(
|
||||
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
|
||||
)
|
||||
doc = Doc(self.vocab, words=["x", "y", "z"])
|
||||
get_examples = lambda: [Example.from_dict(doc, {})]
|
||||
# Populate vocab
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="Language", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
|
@ -1187,7 +1191,7 @@ class Language:
|
|||
return self._optimizer
|
||||
|
||||
def resume_training(
|
||||
self, *, sgd: Optional[Optimizer] = None, device: int = -1,
|
||||
self, *, sgd: Optional[Optimizer] = None, device: int = -1
|
||||
) -> Optimizer:
|
||||
"""Continue training a pretrained model.
|
||||
|
||||
|
|
|
@ -1,105 +0,0 @@
|
|||
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
|
||||
from typing import Dict, Optional
|
||||
import numpy
|
||||
from thinc.api import Model
|
||||
from thinc.types import Padded, Floats3d
|
||||
|
||||
|
||||
def BILUO() -> Model[Padded, Padded]:
|
||||
return Model(
|
||||
"biluo",
|
||||
forward,
|
||||
init=init,
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": get_num_actions},
|
||||
)
|
||||
|
||||
|
||||
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
|
||||
if X is not None and Y is not None:
|
||||
if X.data.shape != Y.data.shape:
|
||||
# TODO: Fix error
|
||||
raise ValueError("Mismatched shapes (TODO: Fix message)")
|
||||
model.set_dim("nO", X.data.shape[2])
|
||||
elif X is not None:
|
||||
model.set_dim("nO", X.data.shape[2])
|
||||
elif Y is not None:
|
||||
model.set_dim("nO", Y.data.shape[2])
|
||||
elif model.get_dim("nO") is None:
|
||||
raise ValueError("Dimension unset for BILUO: nO")
|
||||
|
||||
|
||||
def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
|
||||
n_labels = (model.get_dim("nO") - 1) // 4
|
||||
n_tokens, n_docs, n_actions = Xp.data.shape
|
||||
# At each timestep, we make a validity mask of shape (n_docs, n_actions)
|
||||
# to indicate which actions are valid next for each sequence. To construct
|
||||
# the mask, we have a state of shape (2, n_actions) and a validity table of
|
||||
# shape (2, n_actions+1, n_actions). The first dimension of the state indicates
|
||||
# whether it's the last token, the second dimension indicates the previous
|
||||
# action, plus a special 'null action' for the first entry.
|
||||
valid_transitions = model.ops.asarray(_get_transition_table(n_labels))
|
||||
prev_actions = model.ops.alloc1i(n_docs)
|
||||
# Initialize as though prev action was O
|
||||
prev_actions.fill(n_actions - 1)
|
||||
Y = model.ops.alloc3f(*Xp.data.shape)
|
||||
masks = model.ops.alloc3f(*Y.shape)
|
||||
max_value = Xp.data.max()
|
||||
for t in range(Xp.data.shape[0]):
|
||||
is_last = (Xp.lengths < (t + 2)).astype("i")
|
||||
masks[t] = valid_transitions[is_last, prev_actions]
|
||||
# Don't train the out-of-bounds sequences.
|
||||
masks[t, Xp.size_at_t[t] :] = 0
|
||||
# Valid actions get 0*10e8, invalid get large negative value
|
||||
Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
|
||||
prev_actions = Y[t].argmax(axis=-1)
|
||||
|
||||
def backprop_biluo(dY: Padded) -> Padded:
|
||||
dY.data *= masks
|
||||
return dY
|
||||
|
||||
return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
|
||||
|
||||
|
||||
def get_num_actions(n_labels: int) -> int:
|
||||
# One BEGIN action per label
|
||||
# One IN action per label
|
||||
# One LAST action per label
|
||||
# One UNIT action per label
|
||||
# One OUT action
|
||||
return n_labels + n_labels + n_labels + n_labels + 1
|
||||
|
||||
|
||||
def _get_transition_table(
|
||||
n_labels: int, *, _cache: Dict[int, Floats3d] = {}
|
||||
) -> Floats3d:
|
||||
n_actions = get_num_actions(n_labels)
|
||||
if n_actions in _cache:
|
||||
return _cache[n_actions]
|
||||
table = numpy.zeros((2, n_actions, n_actions), dtype="f")
|
||||
B_start, B_end = (0, n_labels)
|
||||
I_start, I_end = (B_end, B_end + n_labels)
|
||||
L_start, L_end = (I_end, I_end + n_labels)
|
||||
U_start, _ = (L_end, L_end + n_labels) # noqa: F841
|
||||
# Using ranges allows us to set specific cells, which is necessary to express
|
||||
# that only actions of the same label are valid continuations.
|
||||
B_range = numpy.arange(B_start, B_end)
|
||||
I_range = numpy.arange(I_start, I_end)
|
||||
L_range = numpy.arange(L_start, L_end)
|
||||
# If this is the last token and the previous action was B or I, only L
|
||||
# of that label is valid
|
||||
table[1, B_range, L_range] = 1
|
||||
table[1, I_range, L_range] = 1
|
||||
# If this isn't the last token and the previous action was B or I, only I or
|
||||
# L of that label are valid.
|
||||
table[0, B_range, I_range] = 1
|
||||
table[0, B_range, L_range] = 1
|
||||
table[0, I_range, I_range] = 1
|
||||
table[0, I_range, L_range] = 1
|
||||
# If this isn't the last token and the previous was L, U or O, B is valid
|
||||
table[0, L_start:, :B_end] = 1
|
||||
# Regardless of whether this is the last token, if the previous action was
|
||||
# {L, U, O}, U and O are valid.
|
||||
table[:, L_start:, U_start:] = 1
|
||||
_cache[n_actions] = table
|
||||
return table
|
|
@ -1,90 +0,0 @@
|
|||
"""Thinc layer to do simpler transition-based parsing, NER, etc."""
|
||||
from typing import Dict, Optional
|
||||
from thinc.api import Ops, Model
|
||||
from thinc.types import Padded, Floats3d
|
||||
|
||||
|
||||
def IOB() -> Model[Padded, Padded]:
|
||||
return Model(
|
||||
"biluo",
|
||||
forward,
|
||||
init=init,
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": get_num_actions},
|
||||
)
|
||||
|
||||
|
||||
def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
|
||||
if X is not None and Y is not None:
|
||||
if X.data.shape != Y.data.shape:
|
||||
# TODO: Fix error
|
||||
raise ValueError("Mismatched shapes (TODO: Fix message)")
|
||||
model.set_dim("nO", X.data.shape[2])
|
||||
elif X is not None:
|
||||
model.set_dim("nO", X.data.shape[2])
|
||||
elif Y is not None:
|
||||
model.set_dim("nO", Y.data.shape[2])
|
||||
elif model.get_dim("nO") is None:
|
||||
raise ValueError("Dimension unset for BILUO: nO")
|
||||
|
||||
|
||||
def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
|
||||
n_labels = (model.get_dim("nO") - 1) // 2
|
||||
n_tokens, n_docs, n_actions = Xp.data.shape
|
||||
# At each timestep, we make a validity mask of shape (n_docs, n_actions)
|
||||
# to indicate which actions are valid next for each sequence. To construct
|
||||
# the mask, we have a state of shape (2, n_actions) and a validity table of
|
||||
# shape (2, n_actions+1, n_actions). The first dimension of the state indicates
|
||||
# whether it's the last token, the second dimension indicates the previous
|
||||
# action, plus a special 'null action' for the first entry.
|
||||
valid_transitions = _get_transition_table(model.ops, n_labels)
|
||||
prev_actions = model.ops.alloc1i(n_docs)
|
||||
# Initialize as though prev action was O
|
||||
prev_actions.fill(n_actions - 1)
|
||||
Y = model.ops.alloc3f(*Xp.data.shape)
|
||||
masks = model.ops.alloc3f(*Y.shape)
|
||||
for t in range(Xp.data.shape[0]):
|
||||
masks[t] = valid_transitions[prev_actions]
|
||||
# Don't train the out-of-bounds sequences.
|
||||
masks[t, Xp.size_at_t[t] :] = 0
|
||||
# Valid actions get 0*10e8, invalid get -1*10e8
|
||||
Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
|
||||
prev_actions = Y[t].argmax(axis=-1)
|
||||
|
||||
def backprop_biluo(dY: Padded) -> Padded:
|
||||
# Masking the gradient seems to do poorly here. But why?
|
||||
# dY.data *= masks
|
||||
return dY
|
||||
|
||||
return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
|
||||
|
||||
|
||||
def get_num_actions(n_labels: int) -> int:
|
||||
# One BEGIN action per label
|
||||
# One IN action per label
|
||||
# One LAST action per label
|
||||
# One UNIT action per label
|
||||
# One OUT action
|
||||
return n_labels * 2 + 1
|
||||
|
||||
|
||||
def _get_transition_table(
|
||||
ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {}
|
||||
) -> Floats3d:
|
||||
n_actions = get_num_actions(n_labels)
|
||||
if n_actions in _cache:
|
||||
return ops.asarray(_cache[n_actions])
|
||||
table = ops.alloc2f(n_actions, n_actions)
|
||||
B_start, B_end = (0, n_labels)
|
||||
I_start, I_end = (B_end, B_end + n_labels)
|
||||
O_action = I_end
|
||||
B_range = ops.xp.arange(B_start, B_end)
|
||||
I_range = ops.xp.arange(I_start, I_end)
|
||||
# B and O are always valid
|
||||
table[:, B_start:B_end] = 1
|
||||
table[:, O_action] = 1
|
||||
# I can only follow a matching B
|
||||
table[B_range, I_range] = 1
|
||||
|
||||
_cache[n_actions] = table
|
||||
return table
|
|
@ -1,6 +1,5 @@
|
|||
from .entity_linker import * # noqa
|
||||
from .parser import * # noqa
|
||||
from .simple_ner import * # noqa
|
||||
from .tagger import * # noqa
|
||||
from .textcat import * # noqa
|
||||
from .tok2vec import * # noqa
|
||||
|
|
|
@ -1,104 +0,0 @@
|
|||
from typing import List
|
||||
from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
|
||||
from thinc.api import chain, list2padded, configure_normal_init
|
||||
from thinc.api import Dropout
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ...tokens import Doc
|
||||
from .._biluo import BILUO
|
||||
from .._iob import IOB
|
||||
from ...util import registry
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.BILUOTagger.v1")
|
||||
def BiluoTagger(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct a simple NER tagger, that predicts BILUO tag scores for each
|
||||
token and uses greedy decoding with transition-constraints to return a valid
|
||||
BILUO tag sequence.
|
||||
|
||||
A BILUO tag sequence encodes a sequence of non-overlapping labelled spans
|
||||
into tags assigned to each token. The first token of a span is given the
|
||||
tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens
|
||||
within the span are given the tag U-LABEL. Single-token spans are given
|
||||
the tag U-LABEL. All other tokens are assigned the tag O.
|
||||
|
||||
The BILUO tag scheme generally results in better linear separation between
|
||||
classes, especially for non-CRF models, because there are more distinct classes
|
||||
for the different situations (Ratinov et al., 2009).
|
||||
"""
|
||||
biluo = BILUO()
|
||||
linear = Linear(
|
||||
nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
|
||||
)
|
||||
model = chain(
|
||||
tok2vec,
|
||||
list2padded(),
|
||||
with_array(chain(Dropout(0.1), linear)),
|
||||
biluo,
|
||||
with_array(softmax_activation()),
|
||||
padded2list(),
|
||||
)
|
||||
return Model(
|
||||
"biluo-tagger",
|
||||
forward,
|
||||
init=init,
|
||||
layers=[model, linear],
|
||||
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.IOBTagger.v1")
|
||||
def IOBTagger(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]]
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct a simple NER tagger, that predicts IOB tag scores for each
|
||||
token and uses greedy decoding with transition-constraints to return a valid
|
||||
IOB tag sequence.
|
||||
|
||||
An IOB tag sequence encodes a sequence of non-overlapping labelled spans
|
||||
into tags assigned to each token. The first token of a span is given the
|
||||
tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
|
||||
All other tokens are assigned the tag O.
|
||||
"""
|
||||
biluo = IOB()
|
||||
linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
|
||||
model = chain(
|
||||
tok2vec,
|
||||
list2padded(),
|
||||
with_array(linear),
|
||||
biluo,
|
||||
with_array(softmax_activation()),
|
||||
padded2list(),
|
||||
)
|
||||
return Model(
|
||||
"iob-tagger",
|
||||
forward,
|
||||
init=init,
|
||||
layers=[model],
|
||||
refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
|
||||
dims={"nO": None},
|
||||
attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
|
||||
)
|
||||
|
||||
|
||||
def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
|
||||
if model.get_dim("nO") is None and Y:
|
||||
model.set_dim("nO", Y[0].shape[1])
|
||||
nO = model.get_dim("nO")
|
||||
biluo = model.get_ref("biluo")
|
||||
linear = model.get_ref("linear")
|
||||
biluo.set_dim("nO", nO)
|
||||
if linear.has_dim("nO") is None:
|
||||
linear.set_dim("nO", nO)
|
||||
model.layers[0].initialize(X=X, Y=Y)
|
||||
|
||||
|
||||
def forward(model: Model, X: List[Doc], is_train: bool):
|
||||
return model.layers[0](X, is_train)
|
||||
|
||||
|
||||
__all__ = ["BiluoTagger"]
|
|
@ -165,7 +165,7 @@ def MultiHashEmbed(
|
|||
|
||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||
"""Construct an embedded representations based on character embeddings, using
|
||||
"""Construct an embedded representation based on character embeddings, using
|
||||
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||
each word, taken from the beginning and end of the word equally. Padding is
|
||||
used in the centre for words that are too short.
|
||||
|
@ -176,8 +176,8 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
|||
ensures that the final character is always in the last position, instead
|
||||
of being in an arbitrary position depending on the word length.
|
||||
|
||||
The characters are embedded in a embedding table with 256 rows, and the
|
||||
vectors concatenated. A hash-embedded vector of the NORM of the word is
|
||||
The characters are embedded in a embedding table with a given number of rows,
|
||||
and the vectors concatenated. A hash-embedded vector of the NORM of the word is
|
||||
also concatenated on, and the result is then passed through a feed-forward
|
||||
network to construct a single vector to represent the information.
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ from .morphologizer import Morphologizer
|
|||
from .pipe import Pipe
|
||||
from .senter import SentenceRecognizer
|
||||
from .sentencizer import Sentencizer
|
||||
from .simple_ner import SimpleNER
|
||||
from .tagger import Tagger
|
||||
from .textcat import TextCategorizer
|
||||
from .tok2vec import Tok2Vec
|
||||
|
@ -25,7 +24,6 @@ __all__ = [
|
|||
"Pipe",
|
||||
"SentenceRecognizer",
|
||||
"Sentencizer",
|
||||
"SimpleNER",
|
||||
"Tagger",
|
||||
"TextCategorizer",
|
||||
"Tok2Vec",
|
||||
|
|
|
@ -8,7 +8,7 @@ from ...typedefs cimport hash_t, attr_t
|
|||
from ...strings cimport hash_string
|
||||
from ...structs cimport TokenC
|
||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||
from ...gold.example cimport Example
|
||||
from ...training.example cimport Example
|
||||
from ...errors import Errors
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
|
|
@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
|
|||
from ...typedefs cimport weight_t, attr_t
|
||||
from ...lexeme cimport Lexeme
|
||||
from ...attrs cimport IS_SPACE
|
||||
from ...gold.example cimport Example
|
||||
from ...training.example cimport Example
|
||||
from ...errors import Errors
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
|
|
@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
|
|||
from ...typedefs cimport attr_t, weight_t
|
||||
from ...structs cimport TokenC
|
||||
from ...strings cimport StringStore
|
||||
from ...gold.example cimport Example
|
||||
from ...training.example cimport Example
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ from pathlib import Path
|
|||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from ..language import Language
|
||||
from ..matcher import Matcher
|
||||
from ..scorer import Scorer
|
||||
|
|
|
@ -9,7 +9,7 @@ from .functions import merge_subtokens
|
|||
from ..language import Language
|
||||
from ._parser_internals import nonproj
|
||||
from ..scorer import Scorer
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from itertools import islice
|
||||
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
|
@ -11,7 +12,7 @@ from ..tokens import Doc
|
|||
from .pipe import Pipe, deserialize_config
|
||||
from ..language import Language
|
||||
from ..vocab import Vocab
|
||||
from ..gold import Example, validate_examples
|
||||
from ..training import Example, validate_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList
|
||||
from .. import util
|
||||
|
@ -128,7 +129,7 @@ class EntityLinker(Pipe):
|
|||
# how many neightbour sentences to take into account
|
||||
self.n_sents = cfg.get("n_sents", 0)
|
||||
|
||||
def require_kb(self) -> None:
|
||||
def _require_kb(self) -> None:
|
||||
# Raise an error if the knowledge base is not initialized.
|
||||
if len(self.kb) == 0:
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
@ -140,10 +141,11 @@ class EntityLinker(Pipe):
|
|||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
|
@ -153,10 +155,19 @@ class EntityLinker(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
|
||||
"""
|
||||
self.require_kb()
|
||||
self._ensure_examples(get_examples)
|
||||
self._require_kb()
|
||||
nO = self.kb.entity_vector_length
|
||||
self.set_output(nO)
|
||||
self.model.initialize()
|
||||
doc_sample = []
|
||||
vector_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
vector_sample.append(self.model.ops.alloc1f(nO))
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(
|
||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||
)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
@ -184,7 +195,7 @@ class EntityLinker(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#update
|
||||
"""
|
||||
self.require_kb()
|
||||
self._require_kb()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
@ -296,7 +307,7 @@ class EntityLinker(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#predict
|
||||
"""
|
||||
self.require_kb()
|
||||
self._require_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids = []
|
||||
if not docs:
|
||||
|
@ -405,7 +416,7 @@ class EntityLinker(Pipe):
|
|||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
|
@ -422,7 +433,7 @@ class EntityLinker(Pipe):
|
|||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityLinker":
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
|
|||
from ..tokens import Doc, Span
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..scorer import Scorer
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
|
||||
|
||||
DEFAULT_ENT_ID_SEP = "||"
|
||||
|
|
|
@ -8,7 +8,7 @@ from ..lookups import Lookups, load_lookups
|
|||
from ..scorer import Scorer
|
||||
from ..tokens import Doc, Token
|
||||
from ..vocab import Vocab
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from .. import util
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from typing import Optional
|
||||
import srsly
|
||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||
from itertools import islice
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..vocab cimport Vocab
|
||||
|
@ -15,7 +16,7 @@ from .pipe import deserialize_config
|
|||
from .tagger import Tagger
|
||||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -112,6 +113,7 @@ class Morphologizer(Tagger):
|
|||
raise ValueError(Errors.E187)
|
||||
if label in self.labels:
|
||||
return 0
|
||||
self._allow_extra_label()
|
||||
# normalize label
|
||||
norm_label = self.vocab.morphology.normalize_features(label)
|
||||
# extract separate POS and morph tags
|
||||
|
@ -128,10 +130,11 @@ class Morphologizer(Tagger):
|
|||
return 1
|
||||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
|
@ -141,9 +144,8 @@ class Morphologizer(Tagger):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
|
||||
"""
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
self._ensure_examples(get_examples)
|
||||
# First, fetch all labels from the data
|
||||
for example in get_examples():
|
||||
for i, token in enumerate(example.reference):
|
||||
pos = token.pos_
|
||||
|
@ -157,8 +159,25 @@ class Morphologizer(Tagger):
|
|||
if norm_label not in self.cfg["labels_morph"]:
|
||||
self.cfg["labels_morph"][norm_label] = morph
|
||||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||
self.set_output(len(self.labels))
|
||||
self.model.initialize()
|
||||
if len(self.labels) <= 1:
|
||||
raise ValueError(Errors.E143.format(name=self.name))
|
||||
doc_sample = []
|
||||
label_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
gold_array = []
|
||||
for i, token in enumerate(example.reference):
|
||||
pos = token.pos_
|
||||
morph = token.morph_
|
||||
morph_dict = Morphology.feats_to_dict(morph)
|
||||
if pos:
|
||||
morph_dict[self.POS_FEAT] = pos
|
||||
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
|
||||
gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
|
||||
doc_sample.append(example.x)
|
||||
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
|
|
@ -8,7 +8,7 @@ from ..tokens.doc cimport Doc
|
|||
|
||||
from .pipe import Pipe
|
||||
from .tagger import Tagger
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from ..language import Language
|
||||
from ._parser_internals import nonproj
|
||||
from ..attrs import POS, ID
|
||||
|
@ -90,7 +90,7 @@ class MultitaskObjective(Tagger):
|
|||
label = self.make_label(token)
|
||||
if label is not None and label not in self.labels:
|
||||
self.labels[label] = len(self.labels)
|
||||
self.model.initialize()
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
@ -178,7 +178,7 @@ class ClozeMultitask(Pipe):
|
|||
pass
|
||||
|
||||
def begin_training(self, get_examples, pipeline=None, sgd=None):
|
||||
self.model.initialize()
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||
self.model.output_layer.begin_training(X)
|
||||
if sgd is None:
|
||||
|
|
|
@ -7,7 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown
|
|||
|
||||
from ..language import Language
|
||||
from ..scorer import Scorer
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
|
|
@ -4,7 +4,7 @@ from thinc.api import set_dropout_rate, Model
|
|||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from ..errors import Errors
|
||||
from .. import util
|
||||
|
||||
|
@ -160,6 +160,20 @@ cdef class Pipe:
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
||||
|
||||
|
||||
def _require_labels(self) -> None:
|
||||
"""Raise an error if the component's model has no labels defined."""
|
||||
if not self.labels or list(self.labels) == [""]:
|
||||
raise ValueError(Errors.E143.format(name=self.name))
|
||||
|
||||
|
||||
def _allow_extra_label(self) -> None:
|
||||
"""Raise an error if the component can not add any more labels."""
|
||||
if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
|
||||
if not self.is_resizable():
|
||||
raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
|
||||
|
||||
|
||||
def create_optimizer(self):
|
||||
"""Create an optimizer for the pipeline component.
|
||||
|
||||
|
@ -171,9 +185,12 @@ cdef class Pipe:
|
|||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
This method needs to be implemented by each Pipe component,
|
||||
ensuring the internal model (if available) is initialized properly
|
||||
using the provided sample of Example objects.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
|
@ -183,16 +200,24 @@ cdef class Pipe:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/pipe#begin_training
|
||||
"""
|
||||
self.model.initialize()
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
||||
|
||||
def _ensure_examples(self, get_examples):
|
||||
if get_examples is None or not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name=self.name, obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
if not get_examples():
|
||||
err = Errors.E930.format(name=self.name, obj=get_examples())
|
||||
raise ValueError(err)
|
||||
|
||||
def is_resizable(self):
|
||||
return hasattr(self, "model") and "resize_output" in self.model.attrs
|
||||
|
||||
def set_output(self, nO):
|
||||
if self.model.has_dim("nO") is not False:
|
||||
self.model.set_dim("nO", nO)
|
||||
if self.model.has_ref("output_layer"):
|
||||
self.model.get_ref("output_layer").set_dim("nO", nO)
|
||||
if self.is_resizable():
|
||||
self.model.attrs["resize_output"](self.model, nO)
|
||||
else:
|
||||
raise NotImplementedError(Errors.E921)
|
||||
|
||||
def use_params(self, params):
|
||||
"""Modify the pipe's model, to use the given parameter values. At the
|
||||
|
|
|
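Hedged illustration of the contract enforced by the new _ensure_examples helper above: get_examples must be a callable that returns a non-empty iterable of Example objects (nlp, pipe and train_examples are assumed names):

    nlp.begin_training(get_examples=lambda: train_examples)  # OK: callable returning a non-empty sample
    pipe._ensure_examples(train_examples)                    # ValueError (E930): not a callable
    pipe._ensure_examples(lambda: [])                        # ValueError (E930): empty sample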
@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
|
|||
from .pipe import Pipe
|
||||
from ..language import Language
|
||||
from ..scorer import Scorer
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from .. import util
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from itertools import islice
|
||||
|
||||
import srsly
|
||||
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
|
||||
|
||||
|
@ -9,7 +11,7 @@ from .tagger import Tagger
|
|||
from ..language import Language
|
||||
from ..errors import Errors
|
||||
from ..scorer import Scorer
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -124,10 +126,11 @@ class SentenceRecognizer(Tagger):
|
|||
return float(loss), d_scores
|
||||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
|
@ -137,8 +140,18 @@ class SentenceRecognizer(Tagger):

        DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
        """
        self.set_output(len(self.labels))
        self.model.initialize()
        self._ensure_examples(get_examples)
        doc_sample = []
        label_sample = []
        assert self.labels, Errors.E924.format(name=self.name)
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_tags = example.get_aligned("SENT_START")
            gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
            label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
|
||||
|
|
|
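For reference, a small sketch of the label sample built above for the senter: one float32 row per token, one column per label (the ("I", "S") label pair is an assumption here):

    import numpy
    labels = ("I", "S")
    gold_tags = ["S", "I", "I"]
    gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in labels] for gold_tag in gold_tags]
    label_sample = numpy.asarray(gold_array, dtype="float32")  # shape (3, 2)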
@ -1,211 +0,0 @@
|
|||
from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
|
||||
from thinc.api import Optimizer, Config
|
||||
from thinc.util import to_numpy
|
||||
|
||||
from ..errors import Errors
|
||||
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
|
||||
from ..gold import validate_examples
|
||||
from ..tokens import Doc
|
||||
from ..language import Language
|
||||
from ..vocab import Vocab
|
||||
from ..scorer import Scorer
|
||||
from .pipe import Pipe
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.BILUOTagger.v1"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 128
|
||||
depth = 4
|
||||
embed_size = 7000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
"""
|
||||
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"simple_ner",
|
||||
assigns=["doc.ents"],
|
||||
default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL},
|
||||
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
|
||||
)
|
||||
def make_simple_ner(
|
||||
nlp: Language, name: str, model: Model, labels: Iterable[str]
|
||||
) -> "SimpleNER":
|
||||
return SimpleNER(nlp.vocab, model, name, labels=labels)
|
||||
|
||||
|
||||
class SimpleNER(Pipe):
|
||||
"""Named entity recognition with a tagging model. The model should include
|
||||
validity constraints to ensure that only valid tag sequences are returned."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "simple_ner",
|
||||
*,
|
||||
labels: Iterable[str],
|
||||
) -> None:
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.cfg = {"labels": []}
|
||||
for label in labels:
|
||||
self.add_label(label)
|
||||
self.loss_func = SequenceCategoricalCrossentropy(
|
||||
names=self.get_tag_names(), normalize=True, missing_value=None
|
||||
)
|
||||
assert self.model is not None
|
||||
|
||||
@property
|
||||
def is_biluo(self) -> bool:
|
||||
return self.model.name.startswith("biluo")
|
||||
|
||||
@property
|
||||
def labels(self) -> Tuple[str]:
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
def add_label(self, label: str) -> None:
|
||||
"""Add a new label to the pipe.
|
||||
label (str): The label to add.
|
||||
DOCS: https://nightly.spacy.io/api/simplener#add_label
|
||||
"""
|
||||
if not isinstance(label, str):
|
||||
raise ValueError(Errors.E187)
|
||||
if label not in self.labels:
|
||||
self.cfg["labels"].append(label)
|
||||
self.vocab.strings.add(label)
|
||||
|
||||
def get_tag_names(self) -> List[str]:
|
||||
if self.is_biluo:
|
||||
return (
|
||||
[f"B-{label}" for label in self.labels]
|
||||
+ [f"I-{label}" for label in self.labels]
|
||||
+ [f"L-{label}" for label in self.labels]
|
||||
+ [f"U-{label}" for label in self.labels]
|
||||
+ ["O"]
|
||||
)
|
||||
else:
|
||||
return (
|
||||
[f"B-{label}" for label in self.labels]
|
||||
+ [f"I-{label}" for label in self.labels]
|
||||
+ ["O"]
|
||||
)
|
||||
|
||||
def predict(self, docs: List[Doc]) -> List[Floats2d]:
|
||||
scores = self.model.predict(docs)
|
||||
return scores
|
||||
|
||||
def set_annotations(self, docs: List[Doc], scores: List[Floats2d]) -> None:
|
||||
"""Set entities on a batch of documents from a batch of scores."""
|
||||
tag_names = self.get_tag_names()
|
||||
for i, doc in enumerate(docs):
|
||||
actions = to_numpy(scores[i].argmax(axis=1))
|
||||
tags = [tag_names[actions[j]] for j in range(len(doc))]
|
||||
if not self.is_biluo:
|
||||
tags = iob_to_biluo(tags)
|
||||
doc.ents = spans_from_biluo_tags(doc, tags)
|
||||
|
||||
def update(
|
||||
self,
|
||||
examples: List[Example],
|
||||
*,
|
||||
set_annotations: bool = False,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
) -> Dict[str, float]:
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault("ner", 0.0)
|
||||
validate_examples(examples, "SimpleNER.update")
|
||||
if not any(_has_ner(eg) for eg in examples):
|
||||
return losses
|
||||
docs = [eg.predicted for eg in examples]
|
||||
set_dropout_rate(self.model, drop)
|
||||
scores, bp_scores = self.model.begin_update(docs)
|
||||
loss, d_scores = self.get_loss(examples, scores)
|
||||
bp_scores(d_scores)
|
||||
if set_annotations:
|
||||
self.set_annotations(docs, scores)
|
||||
if sgd is not None:
|
||||
self.model.finish_update(sgd)
|
||||
losses["ner"] += loss
|
||||
return losses
|
||||
|
||||
def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
|
||||
validate_examples(examples, "SimpleNER.get_loss")
|
||||
truths = []
|
||||
for eg in examples:
|
||||
tags = eg.get_aligned_ner()
|
||||
gold_tags = [(tag if tag != "-" else None) for tag in tags]
|
||||
if not self.is_biluo:
|
||||
gold_tags = biluo_to_iob(gold_tags)
|
||||
truths.append(gold_tags)
|
||||
for i in range(len(scores)):
|
||||
if len(scores[i]) != len(truths[i]):
|
||||
raise ValueError(
|
||||
f"Mismatched output and gold sizes.\n"
|
||||
f"Output: {len(scores[i])}, gold: {len(truths[i])}."
|
||||
f"Input: {len(examples[i].doc)}"
|
||||
)
|
||||
d_scores, loss = self.loss_func(scores, truths)
|
||||
return loss, d_scores
|
||||
|
||||
def begin_training(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
):
|
||||
all_labels = set()
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="SimpleNER", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
for example in get_examples():
|
||||
all_labels.update(_get_labels(example))
|
||||
for label in sorted(all_labels):
|
||||
self.add_label(label)
|
||||
labels = self.labels
|
||||
n_actions = self.model.attrs["get_num_actions"](len(labels))
|
||||
self.model.set_dim("nO", n_actions)
|
||||
self.model.initialize()
|
||||
if pipeline is not None:
|
||||
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
|
||||
self.loss_func = SequenceCategoricalCrossentropy(
|
||||
names=self.get_tag_names(), normalize=True, missing_value=None
|
||||
)
|
||||
return sgd
|
||||
|
||||
def init_multitask_objectives(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
validate_examples(examples, "SimpleNER.score")
|
||||
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||
|
||||
|
||||
def _has_ner(example: Example) -> bool:
|
||||
for ner_tag in example.get_aligned_ner():
|
||||
if ner_tag != "-" and ner_tag is not None:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def _get_labels(example: Example) -> Set[str]:
|
||||
labels = set()
|
||||
for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
|
||||
if ner_tag != "O" and ner_tag != "-":
|
||||
labels.add(ner_tag)
|
||||
return labels
|
|
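The simple_ner component above is removed outright. As a hedged pointer only, the transition-based "ner" factory covers the same use case in spaCy v3; a minimal replacement sketch (nlp, TRAIN_DATA and train_examples are assumed names):

    ner = nlp.add_pipe("ner")
    for text, annots in TRAIN_DATA:
        for start, end, label in annots["entities"]:
            ner.add_label(label)
    optimizer = nlp.begin_training(get_examples=lambda: train_examples)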
@ -5,6 +5,7 @@ import srsly
|
|||
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
||||
from thinc.types import Floats2d
|
||||
import warnings
|
||||
from itertools import islice
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..morphology cimport Morphology
|
||||
|
@ -16,7 +17,7 @@ from ..attrs import POS, ID
|
|||
from ..parts_of_speech import X
|
||||
from ..errors import Errors, TempErrors, Warnings
|
||||
from ..scorer import Scorer
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -258,10 +259,11 @@ class Tagger(Pipe):
|
|||
return float(loss), d_scores
|
||||
|
||||
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
|
@ -271,32 +273,24 @@ class Tagger(Pipe):

        DOCS: https://nightly.spacy.io/api/tagger#begin_training
        """
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="Tagger", obj=type(get_examples))
            raise ValueError(err)
        tags = set()
        self._ensure_examples(get_examples)
        doc_sample = []
        label_sample = []
        tags = set()
        for example in get_examples():
            for token in example.y:
                if token.tag_:
                    tags.add(token.tag_)
            if len(doc_sample) < 10:
                doc_sample.append(example.x)
        if not doc_sample:
            doc_sample.append(Doc(self.vocab, words=["hello"]))
        for tag in sorted(tags):
            self.add_label(tag)
        if len(self.labels) == 0:
            err = Errors.E1006.format(name="Tagger")
            raise ValueError(err)
        self.set_output(len(self.labels))
        if doc_sample:
            label_sample = [
                self.model.ops.alloc2f(len(doc), len(self.labels))
                for doc in doc_sample
            ]
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_tags = example.get_aligned("TAG", as_string=True)
            gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
            label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
        else:
            self.model.initialize()
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
|
||||
|
@ -313,6 +307,7 @@ class Tagger(Pipe):
|
|||
raise ValueError(Errors.E187)
|
||||
if label in self.labels:
|
||||
return 0
|
||||
self._allow_extra_label()
|
||||
self.cfg["labels"].append(label)
|
||||
self.vocab.strings.add(label)
|
||||
return 1
|
||||
|
|
|
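A hedged end-to-end sketch of the reworked tagger initialization, following the updated tests later in this diff (nlp and TRAIN_DATA are assumed names):

    from spacy.training import Example

    tagger = nlp.add_pipe("tagger")
    train_examples = [Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA]
    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
    assert tagger.model.get_dim("nO") == len(tagger.labels)  # labels are inferred from the sample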
@ -1,3 +1,4 @@
|
|||
from itertools import islice
|
||||
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any
|
||||
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
|
||||
from thinc.types import Floats2d
|
||||
|
@ -5,7 +6,7 @@ import numpy
|
|||
|
||||
from .pipe import Pipe
|
||||
from ..language import Language
|
||||
from ..gold import Example, validate_examples
|
||||
from ..training import Example, validate_examples
|
||||
from ..errors import Errors
|
||||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
|
@ -128,11 +129,6 @@ class TextCategorizer(Pipe):
|
|||
"""
|
||||
return tuple(self.cfg.setdefault("labels", []))
|
||||
|
||||
def require_labels(self) -> None:
|
||||
"""Raise an error if the component's model has no labels defined."""
|
||||
if not self.labels:
|
||||
raise ValueError(Errors.E143.format(name=self.name))
|
||||
|
||||
@labels.setter
|
||||
def labels(self, value: Iterable[str]) -> None:
|
||||
self.cfg["labels"] = tuple(value)
|
||||
|
@ -311,17 +307,7 @@ class TextCategorizer(Pipe):
|
|||
raise ValueError(Errors.E187)
|
||||
if label in self.labels:
|
||||
return 0
|
||||
if self.model.has_dim("nO"):
|
||||
# This functionality was available previously, but was broken.
|
||||
# The problem is that we resize the last layer, but the last layer
|
||||
# is actually just an ensemble. We're not resizing the child layers
|
||||
# - a huge problem.
|
||||
raise ValueError(Errors.E116)
|
||||
# smaller = self.model._layers[-1]
|
||||
# larger = Linear(len(self.labels)+1, smaller.nI)
|
||||
# copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
# copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
# self.model._layers[-1] = larger
|
||||
self._allow_extra_label()
|
||||
self.labels = tuple(list(self.labels) + [label])
|
||||
return 1
|
||||
|
||||
|
@ -332,10 +318,11 @@ class TextCategorizer(Pipe):
|
|||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
|
@ -345,22 +332,19 @@ class TextCategorizer(Pipe):

        DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
        """
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
            raise ValueError(err)
        self._ensure_examples(get_examples)
        subbatch = []  # Select a subbatch of examples to initialize the model
        for example in get_examples():
        for example in islice(get_examples(), 10):
            if len(subbatch) < 2:
                subbatch.append(example)
            for cat in example.y.cats:
                self.add_label(cat)
        self.require_labels()
        docs = [eg.reference for eg in subbatch]
        if not docs:  # need at least one doc
            docs = [Doc(self.vocab, words=["hello"])]
        truths, _ = self._examples_to_truth(subbatch)
        self.set_output(len(self.labels))
        self.model.initialize(X=docs, Y=truths)
        doc_sample = [eg.reference for eg in subbatch]
        label_sample, _ = self._examples_to_truth(subbatch)
        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
|
||||
|
|
|
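Sketch of the Y sample the textcat now hands to Model.initialize: one row per doc in the two-example subbatch, one float per category (the category names are assumptions):

    import numpy
    labels = ("POSITIVE", "NEGATIVE")
    label_sample = numpy.asarray([[1.0, 0.0], [0.0, 1.0]], dtype="float32")  # shape (n_docs, n_labels)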
@ -1,8 +1,9 @@
|
|||
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
|
||||
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
||||
from itertools import islice
|
||||
|
||||
from .pipe import Pipe
|
||||
from ..gold import Example, validate_examples
|
||||
from ..training import Example, validate_examples
|
||||
from ..tokens import Doc
|
||||
from ..vocab import Vocab
|
||||
from ..language import Language
|
||||
|
@ -209,10 +210,11 @@ class Tok2Vec(Pipe):
|
|||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
|
@ -222,8 +224,12 @@ class Tok2Vec(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
|
||||
"""
|
||||
docs = [Doc(self.vocab, words=["hello"])]
|
||||
self.model.initialize(X=docs)
|
||||
self._ensure_examples(get_examples)
|
||||
doc_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
assert doc_sample, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(X=doc_sample)
|
||||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
||||
|
|
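Hedged usage note: the tok2vec layer now shapes itself from real documents drawn from get_examples() rather than a dummy "hello" doc; at the pipeline level this looks like (nlp and train_examples are assumed names):

    tok2vec = nlp.add_pipe("tok2vec")
    nlp.begin_training(get_examples=lambda: train_examples)
    assert tok2vec.model.get_dim("nO")  # output width is set during initialization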
|
@ -21,7 +21,7 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
|||
from ..ml.parser_model cimport get_c_weights, get_c_sizes
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..gold import validate_examples
|
||||
from ..training import validate_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .. import util
|
||||
|
||||
|
@ -244,7 +244,7 @@ cdef class Parser(Pipe):
|
|||
int nr_class, int batch_size) nogil:
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
with gil:
|
||||
assert self.moves.n_moves > 0
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
|
@ -378,7 +378,7 @@ cdef class Parser(Pipe):
|
|||
cdef int i
|
||||
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.moves.n_moves > 0
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
|
||||
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
||||
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
||||
|
@ -406,9 +406,7 @@ cdef class Parser(Pipe):
|
|||
self.model.attrs["resize_output"](self.model, nO)
|
||||
|
||||
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
self._ensure_examples(get_examples)
|
||||
self.cfg.update(kwargs)
|
||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||
|
@ -430,9 +428,6 @@ cdef class Parser(Pipe):
|
|||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
doc_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.predicted)
|
||||
|
||||
if pipeline is not None:
|
||||
for name, component in pipeline:
|
||||
if component is self:
|
||||
|
@ -441,10 +436,11 @@ cdef class Parser(Pipe):
|
|||
doc_sample = list(component.pipe(doc_sample, batch_size=8))
|
||||
else:
|
||||
doc_sample = [component(doc) for doc in doc_sample]
|
||||
if doc_sample:
|
||||
if not doc_sample:
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.predicted)
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(doc_sample)
|
||||
else:
|
||||
self.model.initialize()
|
||||
if pipeline is not None:
|
||||
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
|
||||
return sgd
|
||||
|
|
|
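Hedged reading of the new E924 assertions above: they fire when the parser or NER ends up with no transition moves at all, typically because it never received labels or annotated examples before initialization (names assumed):

    parser = nlp.add_pipe("parser")
    parser.add_label("nsubj")  # give the transition system at least one labelled action
    nlp.begin_training(get_examples=lambda: train_examples)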
@ -12,7 +12,7 @@ from .attrs import NAMES
|
|||
if TYPE_CHECKING:
|
||||
# This lets us add type hints for mypy etc. without causing circular imports
|
||||
from .language import Language # noqa: F401
|
||||
from .gold import Example # noqa: F401
|
||||
from .training import Example # noqa: F401
|
||||
|
||||
|
||||
ItemT = TypeVar("ItemT")
|
||||
|
@ -180,7 +180,7 @@ class ModelMetaSchema(BaseModel):
|
|||
url: StrictStr = Field("", title="Model author URL")
|
||||
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
|
||||
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
|
||||
labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name")
|
||||
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
|
||||
accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers")
|
||||
speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers")
|
||||
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
|
||||
|
|
|
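Hedged example of the "labels" block in a model's meta.json under the updated schema, now a plain list of label strings per component (component and label names assumed):

    meta["labels"] = {
        "tagger": ["NN", "VB", "JJ"],
        "ner": ["PERSON", "ORG", "GPE"],
    }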
@ -1,7 +1,7 @@
|
|||
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
|
||||
import numpy as np
|
||||
|
||||
from .gold import Example
|
||||
from .training import Example
|
||||
from .tokens import Token, Doc, Span
|
||||
from .errors import Errors
|
||||
from .util import get_lang_class, SimpleFrozenList
|
||||
|
|
|
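The spacy.gold package is renamed to spacy.training throughout this diff; downstream code needs the same one-line change, for example:

    # before: from spacy.gold import Example, iob_to_biluo
    from spacy.training import Example, iob_to_biluo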
@ -1,5 +1,6 @@
|
|||
from spacy.training import Example
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.tokens import Span
|
||||
from spacy.tokens import Span, Doc
|
||||
from spacy import registry
|
||||
import pytest
|
||||
|
||||
|
@ -7,6 +8,12 @@ from ..util import get_doc
|
|||
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
|
||||
|
||||
def _ner_example(ner):
|
||||
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
|
||||
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
|
||||
return Example.from_dict(doc, gold)
|
||||
|
||||
|
||||
def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||
text = ["This", "is", "a", "lion"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
|
@ -18,10 +25,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
cfg = {"model": DEFAULT_NER_MODEL}
|
||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
||||
ner = EntityRecognizer(en_vocab, model, **config)
|
||||
ner.begin_training(lambda: [])
|
||||
ner.begin_training(lambda: [_ner_example(ner)])
|
||||
ner(doc)
|
||||
assert len(list(doc.ents)) == 0
|
||||
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
||||
|
||||
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
||||
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
|
||||
|
@ -31,6 +36,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
|
||||
|
||||
def test_ents_reset(en_vocab):
|
||||
"""Ensure that resetting doc.ents does not change anything"""
|
||||
text = ["This", "is", "a", "lion"]
|
||||
doc = get_doc(en_vocab, text)
|
||||
config = {
|
||||
|
@ -41,11 +47,11 @@ def test_ents_reset(en_vocab):
|
|||
cfg = {"model": DEFAULT_NER_MODEL}
|
||||
model = registry.make_from_config(cfg, validate=True)["model"]
|
||||
ner = EntityRecognizer(en_vocab, model, **config)
|
||||
ner.begin_training(lambda: [])
|
||||
ner.begin_training(lambda: [_ner_example(ner)])
|
||||
ner(doc)
|
||||
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||
orig_iobs = [t.ent_iob_ for t in doc]
|
||||
doc.ents = list(doc.ents)
|
||||
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
|
||||
assert [t.ent_iob_ for t in doc] == orig_iobs
|
||||
|
||||
|
||||
def test_add_overlapping_entities(en_vocab):
|
||||
|
|
|
@ -3,7 +3,7 @@ from thinc.api import Adam, fix_random_seed
|
|||
from spacy import registry
|
||||
from spacy.attrs import NORM
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.tokens import Doc
|
||||
from spacy.pipeline import DependencyParser, EntityRecognizer
|
||||
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
|
@ -35,7 +35,7 @@ def test_init_parser(parser):
|
|||
def _train_parser(parser):
|
||||
fix_random_seed(1)
|
||||
parser.add_label("left")
|
||||
parser.begin_training(lambda: [], **parser.cfg)
|
||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
||||
sgd = Adam(0.001)
|
||||
|
||||
for i in range(5):
|
||||
|
@ -47,16 +47,25 @@ def _train_parser(parser):
|
|||
return parser
|
||||
|
||||
|
||||
def _parser_example(parser):
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
|
||||
return Example.from_dict(doc, gold)
|
||||
|
||||
|
||||
def _ner_example(ner):
|
||||
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
|
||||
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
|
||||
return Example.from_dict(doc, gold)
|
||||
|
||||
|
||||
def test_add_label(parser):
|
||||
parser = _train_parser(parser)
|
||||
parser.add_label("right")
|
||||
sgd = Adam(0.001)
|
||||
for i in range(100):
|
||||
losses = {}
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
|
||||
example = Example.from_dict(doc, gold)
|
||||
parser.update([example], sgd=sgd, losses=losses)
|
||||
parser.update([_parser_example(parser)], sgd=sgd, losses=losses)
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
doc = parser(doc)
|
||||
assert doc[0].dep_ == "right"
|
||||
|
@ -75,7 +84,7 @@ def test_add_label_deserializes_correctly():
|
|||
ner1.add_label("C")
|
||||
ner1.add_label("B")
|
||||
ner1.add_label("A")
|
||||
ner1.begin_training(lambda: [])
|
||||
ner1.begin_training(lambda: [_ner_example(ner1)])
|
||||
ner2 = EntityRecognizer(Vocab(), model, **config)
|
||||
|
||||
# the second model needs to be resized before we can call from_bytes
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
from spacy.vocab import Vocab
|
||||
from spacy import registry
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.tokens import Doc
|
||||
from spacy.pipeline._parser_internals.nonproj import projectivize
|
||||
|
|
|
@ -4,7 +4,7 @@ from spacy.lang.en import English
|
|||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
import logging
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
|
||||
from spacy import registry
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline._parser_internals.arc_eager import ArcEager
|
||||
from spacy.pipeline.transition_parser import Parser
|
||||
|
|
|
@ -3,7 +3,7 @@ import pytest
|
|||
from spacy.lang.en import English
|
||||
from ..util import get_doc, apply_transition_sequence, make_tempdir
|
||||
from ... import util
|
||||
from ...gold import Example
|
||||
from ...training import Example
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
|
@ -85,7 +85,7 @@ def test_parser_merge_pp(en_tokenizer):
|
|||
pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos,
|
||||
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos
|
||||
)
|
||||
with doc.retokenize() as retokenizer:
|
||||
for np in doc.noun_chunks:
|
||||
|
|
|
@ -3,7 +3,7 @@ from thinc.api import Adam
|
|||
from spacy.attrs import NORM
|
||||
from spacy.vocab import Vocab
|
||||
from spacy import registry
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from spacy.tokens import Doc
|
||||
from spacy.pipeline import DependencyParser
|
||||
|
@ -14,6 +14,12 @@ def vocab():
|
|||
return Vocab(lex_attr_getters={NORM: lambda s: s})
|
||||
|
||||
|
||||
def _parser_example(parser):
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
|
||||
return Example.from_dict(doc, gold)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
config = {
|
||||
|
@ -28,7 +34,7 @@ def parser(vocab):
|
|||
parser.cfg["hidden_width"] = 32
|
||||
# parser.add_label('right')
|
||||
parser.add_label("left")
|
||||
parser.begin_training(lambda: [], **parser.cfg)
|
||||
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
|
||||
sgd = Adam(0.001)
|
||||
|
||||
for i in range(10):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
import numpy
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import AttributeRuler
|
||||
from spacy import util, registry
|
||||
|
|
|
@ -4,7 +4,7 @@ import pytest
|
|||
from spacy.kb import KnowledgeBase, get_candidates, Candidate
|
||||
|
||||
from spacy import util, registry
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.tests.util import make_tempdir
|
||||
from spacy.tokens import Span
|
||||
|
@ -281,11 +281,12 @@ def test_append_invalid_alias(nlp):
|
|||
|
||||
def test_preserving_links_asdoc(nlp):
|
||||
"""Test that Span.as_doc preserves the existing entity links"""
|
||||
vector_length = 1
|
||||
|
||||
@registry.misc.register("myLocationsKB.v1")
|
||||
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
|
||||
def create_kb(vocab):
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=1)
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
|
||||
|
@ -305,10 +306,9 @@ def test_preserving_links_asdoc(nlp):
|
|||
ruler = nlp.add_pipe("entity_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
|
||||
el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
||||
el_pipe.begin_training(lambda: [])
|
||||
el_pipe.incl_context = False
|
||||
el_pipe.incl_prior = True
|
||||
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
|
||||
nlp.begin_training()
|
||||
assert entity_linker.model.get_dim("nO") == vector_length
|
||||
|
||||
# test whether the entity links are preserved by the `as_doc()` function
|
||||
text = "She lives in Boston. He lives in Denver."
|
||||
|
@ -373,6 +373,7 @@ def test_overfitting_IO():
|
|||
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
|
||||
nlp = English()
|
||||
nlp.add_pipe("sentencizer")
|
||||
vector_length = 3
|
||||
|
||||
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data
|
||||
patterns = [
|
||||
|
@ -393,7 +394,7 @@ def test_overfitting_IO():
|
|||
# create artificial KB - assign same prior weight to the two russ cochran's
|
||||
# Q2146908 (Russ Cochran): American golfer
|
||||
# Q7381115 (Russ Cochran): publisher
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=3)
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
||||
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
|
||||
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
|
||||
mykb.add_alias(
|
||||
|
@ -406,14 +407,17 @@ def test_overfitting_IO():
|
|||
return create_kb
|
||||
|
||||
# Create the Entity Linker component and add it to the pipeline
|
||||
nlp.add_pipe(
|
||||
entity_linker = nlp.add_pipe(
|
||||
"entity_linker",
|
||||
config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
|
||||
last=True,
|
||||
)
|
||||
|
||||
# train the NEL pipe
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
assert entity_linker.model.get_dim("nO") == vector_length
|
||||
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
|
||||
from spacy import util
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.language import Language
|
||||
from spacy.tests.util import make_tempdir
|
||||
|
@ -25,27 +25,61 @@ TRAIN_DATA = [
|
|||
},
|
||||
),
|
||||
# test combinations of morph+POS
|
||||
("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},),
|
||||
("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]}),
|
||||
]
|
||||
|
||||
|
||||
def test_no_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("morphologizer")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training()
|
||||
|
||||
|
||||
def test_implicit_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("morphologizer")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
nlp = Language()
|
||||
morphologizer = nlp.add_pipe("morphologizer")
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
|
||||
nlp.begin_training()
|
||||
# this throws an error because the morphologizer can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
nlp = Language()
|
||||
morphologizer = nlp.add_pipe("morphologizer")
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
|
||||
nlp = English()
|
||||
morphologizer = nlp.add_pipe("morphologizer")
|
||||
nlp.add_pipe("morphologizer")
|
||||
train_examples = []
|
||||
for inst in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
|
||||
for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
|
||||
if morph and pos:
|
||||
morphologizer.add_label(
|
||||
morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos
|
||||
)
|
||||
elif pos:
|
||||
morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
|
||||
elif morph:
|
||||
morphologizer.add_label(morph)
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
|
@ -55,18 +89,8 @@ def test_overfitting_IO():
|
|||
# test the trained model
|
||||
test_text = "I like blue ham"
|
||||
doc = nlp(test_text)
|
||||
gold_morphs = [
|
||||
"Feat=N",
|
||||
"Feat=V",
|
||||
"",
|
||||
"",
|
||||
]
|
||||
gold_pos_tags = [
|
||||
"NOUN",
|
||||
"VERB",
|
||||
"ADJ",
|
||||
"",
|
||||
]
|
||||
gold_morphs = ["Feat=N", "Feat=V", "", ""]
|
||||
gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
|
||||
assert [t.morph_ for t in doc] == gold_morphs
|
||||
assert [t.pos_ for t in doc] == gold_pos_tags
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
|
||||
from spacy import util
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.language import Language
|
||||
from spacy.tests.util import make_tempdir
|
||||
|
@ -30,6 +30,20 @@ TRAIN_DATA = [
|
|||
),
|
||||
]
|
||||
|
||||
def test_begin_training_examples():
|
||||
nlp = Language()
|
||||
senter = nlp.add_pipe("senter")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the senter - ensuring the ML models work correctly
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.gold import Example
|
||||
from spacy import util
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
||||
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
|
||||
]
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly
|
||||
nlp = English()
|
||||
ner = nlp.add_pipe("simple_ner")
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for ent in annotations.get("entities"):
|
||||
ner.add_label(ent[2])
|
||||
optimizer = nlp.begin_training()
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["ner"] < 0.0001
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like London."
|
||||
doc = nlp(test_text)
|
||||
ents = doc.ents
|
||||
assert len(ents) == 1
|
||||
assert ents[0].text == "London"
|
||||
assert ents[0].label_ == "LOC"
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
with make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
ents2 = doc2.ents
|
||||
assert len(ents2) == 1
|
||||
assert ents2[0].text == "London"
|
||||
assert ents2[0].label_ == "LOC"
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from spacy import util
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.language import Language
|
||||
|
||||
|
@ -34,6 +34,56 @@ TRAIN_DATA = [
|
|||
]
|
||||
|
||||
|
||||
def test_no_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("tagger")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training()
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
tagger.add_label("N")
|
||||
tagger.add_label("V")
|
||||
assert tagger.labels == ("N", "V")
|
||||
nlp.begin_training()
|
||||
assert tagger.model.get_dim("nO") == 2
|
||||
# this throws an error because the tagger can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
tagger.add_label("J")
|
||||
|
||||
|
||||
def test_implicit_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("tagger")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
train_examples = []
|
||||
for tag in TAGS:
|
||||
tagger.add_label(tag)
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: train_examples[0])
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=lambda: [])
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
|
||||
nlp = English()
|
||||
|
@ -41,9 +91,8 @@ def test_overfitting_IO():
|
|||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
for tag in TAGS:
|
||||
tagger.add_label(tag)
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
assert tagger.model.get_dim("nO") == len(TAGS)
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
|
|
|
@ -10,7 +10,7 @@ from spacy.tokens import Doc
|
|||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...gold import Example
|
||||
from ...training import Example
|
||||
|
||||
|
||||
TRAIN_DATA = [
|
||||
|
@ -80,6 +80,51 @@ def test_label_types():
|
|||
textcat.add_label(9)
|
||||
|
||||
|
||||
def test_no_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("textcat")
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training()
|
||||
|
||||
|
||||
def test_implicit_label():
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
textcat.add_label("POSITIVE")
|
||||
textcat.add_label("NEGATIVE")
|
||||
nlp.begin_training()
|
||||
assert textcat.model.get_dim("nO") == 2
|
||||
# this throws an error because the textcat can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
textcat.add_label("NEUTRAL")
|
||||
|
||||
|
||||
def test_begin_training_examples():
|
||||
nlp = Language()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for label, value in annotations.get("cats").items():
|
||||
textcat.add_label(label)
|
||||
# you shouldn't really call this more than once, but for testing it should be fine
|
||||
nlp.begin_training()
|
||||
nlp.begin_training(get_examples=lambda: train_examples)
|
||||
with pytest.raises(TypeError):
|
||||
nlp.begin_training(get_examples=lambda: None)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.begin_training(get_examples=train_examples)
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
|
||||
fix_random_seed(0)
|
||||
|
@ -89,9 +134,8 @@ def test_overfitting_IO():
|
|||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for label, value in annotations.get("cats").items():
|
||||
textcat.add_label(label)
|
||||
optimizer = nlp.begin_training()
|
||||
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
|
||||
assert textcat.model.get_dim("nO") == 2
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
import random
|
||||
from spacy import util
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import IS_PUNCT, ORTH, LOWER
|
||||
from spacy.vocab import Vocab
|
||||
|
|
|
@ -3,7 +3,7 @@ import gc
|
|||
import numpy
|
||||
import copy
|
||||
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.en.stop_words import STOP_WORDS
|
||||
from spacy.lang.lex_attrs import is_stop
|
||||
|
|
|
@ -3,7 +3,7 @@ import numpy
|
|||
from spacy.tokens import Doc
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.displacy import render
|
||||
from spacy.gold import iob_to_biluo
|
||||
from spacy.training import iob_to_biluo
|
||||
from spacy.lang.it import Italian
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from spacy import displacy
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.ja import Japanese
|
||||
from spacy.lang.xx import MultiLanguage
|
||||
|
@ -20,7 +20,7 @@ def test_issue2564():
|
|||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
tagger.add_label("A")
|
||||
tagger.begin_training(lambda: [])
|
||||
nlp.begin_training()
|
||||
doc = nlp("hello world")
|
||||
assert doc.is_tagged
|
||||
docs = nlp.pipe(["hello", "world"])
|
||||
|
|
|
@ -9,7 +9,7 @@ from spacy.tokens import Doc, Token
|
|||
from spacy.matcher import Matcher, PhraseMatcher
|
||||
from spacy.errors import MatchPatternError
|
||||
from spacy.util import minibatch
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.hi import Hindi
|
||||
from spacy.lang.es import Spanish
|
||||
from spacy.lang.en import English
|
||||
|
@ -251,6 +251,12 @@ def test_issue3803():
|
|||
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
||||
|
||||
|
||||
def _parser_example(parser):
|
||||
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
|
||||
return Example.from_dict(doc, gold)
|
||||
|
||||
|
||||
def test_issue3830_no_subtok():
|
||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||
config = {
|
||||
|
@ -264,7 +270,7 @@ def test_issue3830_no_subtok():
|
|||
parser = DependencyParser(Vocab(), model, **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
parser.begin_training(lambda: [_parser_example(parser)])
|
||||
assert "subtok" not in parser.labels
|
||||
|
||||
|
||||
|
@ -281,7 +287,7 @@ def test_issue3830_with_subtok():
|
|||
parser = DependencyParser(Vocab(), model, **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
parser.begin_training(lambda: [_parser_example(parser)])
|
||||
assert "subtok" in parser.labels
|
||||
|
||||
|
||||
|
|
|
@ -2,8 +2,8 @@ import pytest
|
|||
from spacy.pipeline import Pipe
|
||||
from spacy.matcher import PhraseMatcher, Matcher
|
||||
from spacy.tokens import Doc, Span, DocBin
|
||||
from spacy.gold import Example, Corpus
|
||||
from spacy.gold.converters import json2docs
|
||||
from spacy.training import Example, Corpus
|
||||
from spacy.training.converters import json2docs
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.lang.en import English
|
||||
from spacy.util import minibatch, ensure_path, load_model
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
import pytest
|
||||
from mock import Mock
|
||||
from spacy.matcher import DependencyMatcher
|
||||
from spacy.tokens import Doc, Span, DocBin
|
||||
from spacy.gold import Example
|
||||
from spacy.gold.converters.conllu2docs import conllu2docs
|
||||
from spacy.training import Example
|
||||
from spacy.training.converters.conllu2docs import conllu2docs
|
||||
from spacy.lang.en import English
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.vocab import Vocab
|
||||
|
@ -12,7 +10,7 @@ from spacy.util import ensure_path, load_model_from_path
|
|||
import numpy
|
||||
import pickle
|
||||
|
||||
from ..util import get_doc, make_tempdir
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4528(en_vocab):
|
||||
|
|
|
@ -64,7 +64,7 @@ def tagger():
|
|||
# 1. no model leads to error in serialization,
|
||||
# 2. the affected line is the one for model serialization
|
||||
tagger.add_label("A")
|
||||
tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
nlp.begin_training()
|
||||
return tagger
|
||||
|
||||
|
||||
|
@ -85,7 +85,7 @@ def entity_linker():
|
|||
# need to add model for two reasons:
|
||||
# 1. no model leads to error in serialization,
|
||||
# 2. the affected line is the one for model serialization
|
||||
entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
nlp.begin_training()
|
||||
return entity_linker
|
||||
|
||||
|
||||
|
|
|
@ -1,14 +1,15 @@
|
|||
import pytest
|
||||
from click import NoSuchOption
|
||||
|
||||
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
||||
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||
from spacy.training import docs_to_json, biluo_tags_from_offsets
|
||||
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||
from spacy.lang.en import English
|
||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||
from spacy.cli.pretrain import make_docs
|
||||
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||
from spacy.cli._util import load_project_config, substitute_project_variables
|
||||
from spacy.cli._util import string_to_list
|
||||
from thinc.config import ConfigValidationError
|
||||
import srsly
|
||||
|
||||
|
@ -372,17 +373,13 @@ def test_parse_config_overrides(args, expected):
|
|||
assert parse_config_overrides(args) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"args", [["--foo"], ["--x.foo", "bar", "--baz"]],
|
||||
)
|
||||
@pytest.mark.parametrize("args", [["--foo"], ["--x.foo", "bar", "--baz"]])
|
||||
def test_parse_config_overrides_invalid(args):
|
||||
with pytest.raises(NoSuchOption):
|
||||
parse_config_overrides(args)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"args", [["--x.foo", "bar", "baz"], ["x.foo"]],
|
||||
)
|
||||
@pytest.mark.parametrize("args", [["--x.foo", "bar", "baz"], ["x.foo"]])
|
||||
def test_parse_config_overrides_invalid_2(args):
|
||||
with pytest.raises(SystemExit):
|
||||
parse_config_overrides(args)
|
||||
|
@ -401,3 +398,44 @@ def test_init_config(lang, pipeline, optimize):
|
|||
def test_model_recommendations():
|
||||
for lang, data in RECOMMENDATIONS.items():
|
||||
assert RecommendationSchema(**data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"value",
|
||||
[
|
||||
# fmt: off
|
||||
"parser,textcat,tagger",
|
||||
" parser, textcat ,tagger ",
|
||||
'parser,textcat,tagger',
|
||||
' parser, textcat ,tagger ',
|
||||
' "parser"," textcat " ,"tagger "',
|
||||
" 'parser',' textcat ' ,'tagger '",
|
||||
'[parser,textcat,tagger]',
|
||||
'["parser","textcat","tagger"]',
|
||||
'[" parser" ,"textcat ", " tagger " ]',
|
||||
"[parser,textcat,tagger]",
|
||||
"[ parser, textcat , tagger]",
|
||||
"['parser','textcat','tagger']",
|
||||
"[' parser' , 'textcat', ' tagger ' ]",
|
||||
# fmt: on
|
||||
],
|
||||
)
|
||||
def test_string_to_list(value):
|
||||
assert string_to_list(value, intify=False) == ["parser", "textcat", "tagger"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"value",
|
||||
[
|
||||
# fmt: off
|
||||
"1,2,3",
|
||||
'[1,2,3]',
|
||||
'["1","2","3"]',
|
||||
'[" 1" ,"2 ", " 3 " ]',
|
||||
"[' 1' , '2', ' 3 ' ]",
|
||||
# fmt: on
|
||||
],
|
||||
)
|
||||
def test_string_to_list_intify(value):
|
||||
assert string_to_list(value, intify=False) == ["1", "2", "3"]
|
||||
assert string_to_list(value, intify=True) == [1, 2, 3]
|
||||
|
|
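A short usage sketch of the CLI helper exercised by these tests (import path as used in this test file):

    from spacy.cli._util import string_to_list

    string_to_list("parser, textcat ,tagger", intify=False)  # ["parser", "textcat", "tagger"]
    string_to_list("[1,2,3]", intify=True)                   # [1, 2, 3]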
|
@ -3,7 +3,7 @@ import pytest
|
|||
from spacy.language import Language
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.gold import Example
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.util import registry
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from spacy.gold.example import Example
|
||||
from spacy.training.example import Example
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from numpy.testing import assert_almost_equal, assert_array_almost_equal
|
||||
import pytest
|
||||
from pytest import approx
|
||||
from spacy.gold import Example
|
||||
from spacy.gold.iob_utils import biluo_tags_from_offsets
|
||||
from spacy.training import Example
|
||||
from spacy.training.iob_utils import biluo_tags_from_offsets
|
||||
from spacy.scorer import Scorer, ROCAUCScore
|
||||
from spacy.scorer import _roc_auc_score, _roc_curve
|
||||
from .util import get_doc
|
||||
|
|
|
@@ -6,7 +6,7 @@ from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import Example
from spacy.training import Example
from spacy import util
from spacy.lang.en import English
from .util import get_batch

@@ -89,6 +89,7 @@ def test_init_tok2vec():
    tok2vec = nlp.add_pipe("tok2vec")
    assert tok2vec.listeners == []
    nlp.begin_training()
    assert tok2vec.model.get_dim("nO")


cfg_string = """
@@ -1,9 +1,10 @@
import numpy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example
from spacy.gold.converters import json2docs
from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
from spacy.training import spans_from_biluo_tags, iob_to_biluo
from spacy.training import Corpus, docs_to_json
from spacy.training.example import Example
from spacy.training.converters import json2docs
from spacy.training.augment import make_orth_variants_example
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch

@@ -12,7 +13,6 @@ import pytest
import srsly

from .util import make_tempdir
from ..gold.augment import make_orth_variants_example


@pytest.fixture
@@ -5,7 +5,7 @@ from .util import get_random_doc
from spacy import util
from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer
from spacy.gold.batchers import minibatch_by_words
from spacy.training.batchers import minibatch_by_words
from ..lang.en import English
from ..lang.nl import Dutch
from ..language import DEFAULT_CONFIG_PATH
@@ -24,7 +24,7 @@ from .util import registry
from .attrs import intify_attrs
from .symbols import ORTH
from .scorer import Scorer
from .gold import validate_examples
from .training import validate_examples


cdef class Tokenizer:
@@ -576,7 +576,7 @@ cdef class Doc:
        entity_type = 0
        kb_id = 0

        # Set ent_iob to Missing (0) bij default unless this token was nered before
        # Set ent_iob to Missing (0) by default unless this token was nered before
        ent_iob = 0
        if self.c[i].ent_iob != 0:
            ent_iob = 2
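For context, `ent_iob` is stored as a small integer code on each token. A short sketch of how those codes surface on the Python API; the mapping 0 = missing, 1 = "I", 2 = "O", 3 = "B" follows spaCy's token attribute documentation and is stated here as background, not taken from this hunk:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Ada visited Berlin")
# No NER has run, so every token's ent_iob code is 0 ("missing"),
# which surfaces as an empty string on the string attribute.
print([t.ent_iob for t in doc])   # [0, 0, 0]
print([t.ent_iob_ for t in doc])  # ["", "", ""]
```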
@@ -1,7 +1,7 @@
from wasabi import Printer

from .. import tags_to_entities
from ...gold import iob_to_biluo
from ...training import iob_to_biluo
from ...lang.xx import MultiLanguage
from ...tokens import Doc, Span
from ...util import load_model
@@ -1,7 +1,7 @@
import re

from .conll_ner2docs import n_sents_info
from ...gold import iob_to_biluo, spans_from_biluo_tags
from ...training import iob_to_biluo, spans_from_biluo_tags
from ...tokens import Doc, Token, Span
from ...vocab import Vocab
from wasabi import Printer
@@ -1,7 +1,7 @@
from wasabi import Printer

from .conll_ner2docs import n_sents_info
from ...gold import iob_to_biluo, tags_to_entities
from ...training import iob_to_biluo, tags_to_entities
from ...tokens import Doc, Span
from ...util import minibatch
@@ -195,13 +195,15 @@ def tags_to_entities(tags):
            continue
        elif tag.startswith("I"):
            if start is None:
                raise ValueError(Errors.E067.format(tags=tags[: i + 1]))
                raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
            continue
        if tag.startswith("U"):
            entities.append((tag[2:], i, i))
        elif tag.startswith("B"):
            start = i
        elif tag.startswith("L"):
            if start is None:
                raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1]))
            entities.append((tag[2:], start, i))
            start = None
        else:
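The decode loop above turns a BILUO tag sequence into `(label, start, end)` token spans: `B-` remembers the start, `L-` closes the span, `U-` emits a single-token span, and the new `start=` argument makes the E067 error say which tag appeared without an open span. A small sketch of the expected behaviour, inferred from the branches shown here rather than from a test in this diff:

```python
# Assumed import path, matching the converters' "from .. import tags_to_entities"
from spacy.training import tags_to_entities

tags = ["B-ORG", "L-ORG", "O", "U-PERSON"]
print(tags_to_entities(tags))
# [("ORG", 0, 1), ("PERSON", 3, 3)]  -- token indices are inclusive
```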
@@ -93,6 +93,7 @@ class registry(thinc.registry):
    # environment. spaCy models packaged with `spacy package` will "advertise"
    # themselves via entry points.
    models = catalogue.create("spacy", "models", entry_points=True)
    cli = catalogue.create("spacy", "cli", entry_points=True)


class SimpleFrozenDict(dict):
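With `entry_points=True`, third-party packages can register themselves without being imported explicitly. A hypothetical packaging sketch; the `spacy_models` group name is an assumption based on how `catalogue` derives entry point groups from the namespace, and the package name is made up:

```python
# setup.py of a hypothetical model package (names are illustrative only)
from setuptools import setup

setup(
    name="en_demo_model",
    version="0.0.1",
    entry_points={
        # Group name assumed from catalogue.create("spacy", "models", entry_points=True)
        "spacy_models": ["en_demo_model = en_demo_model"],
    },
)
```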
@@ -647,7 +648,7 @@ def join_command(command: List[str]) -> str:
    return " ".join(shlex.quote(cmd) for cmd in command)


def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None:
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.
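Dropping the `-> None` annotation suggests the helper now returns something, presumably the completed process when output is captured; that reading is an inference from the signature change, not shown in this hunk. A minimal usage sketch:

```python
from spacy.util import run_command

# Runs the command as a subprocess and performs a system exit if it fails.
run_command(["python", "--version"])

# With capture=True the output is presumably available on the returned object
# (assumption based on the removed return annotation).
result = run_command(["python", "--version"], capture=True)
```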
@@ -290,10 +290,10 @@ always be the **last element** in the row.
> ```

| Name | Description |
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
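The renamed `set_extra_annotations` callback receives the whole batch of `Doc`s plus the transformer output and mutates the docs in place. A hedged sketch of a custom callback; the extension name is illustrative, and `FullTransformerBatch` (from `spacy-transformers`) is treated as an opaque object here:

```python
from typing import List
from spacy.tokens import Doc

# Custom extension to hold whatever we want to keep from the transformer output.
Doc.set_extension("trf_data_raw", default=None)

def set_extra_annotations(docs: List[Doc], trf_output) -> None:
    # The callback must mutate the docs in place; it returns nothing.
    for doc in docs:
        doc._.trf_data_raw = trf_output
```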
### List {#list}
@@ -609,7 +609,6 @@ In addition to the native markdown elements, you can use the components
├── docs # the actual markdown content
├── meta # JSON-formatted site metadata
|   ├── languages.json # supported languages and statistical models
|   ├── logos.json # logos and links for landing page
|   ├── sidebars.json # sidebar navigations for different sections
|   ├── site.json # general site metadata
|   └── universe.json # data for the spaCy universe section
@@ -181,10 +181,10 @@ characters would be `"jumpping"`: 4 from the start, 4 from the end. This ensures
that the final character is always in the last position, instead of being in an
arbitrary position depending on the word length.

The characters are embedded in a embedding table with 256 rows, and the vectors
concatenated. A hash-embedded vector of the `NORM` of the word is also
concatenated on, and the result is then passed through a feed-forward network to
construct a single vector to represent the information.
The characters are embedded in a embedding table with a given number of rows,
and the vectors concatenated. A hash-embedded vector of the `NORM` of the word
is also concatenated on, and the result is then passed through a feed-forward
network to construct a single vector to represent the information.
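The `"jumpping"` example in the surrounding text boils down to taking a fixed number of characters from each end of the word, so the final character always sits in the last slot. A tiny sketch of that selection step, purely to illustrate the indexing (short words are padded in the real layer; that detail is skipped here):

```python
def select_chars(word: str, n_chars: int = 8) -> str:
    # Half of the character budget from the start, half from the end.
    # For "jumping" with n_chars=8 this yields "jumpping" -- the "p" is
    # counted twice, exactly as in the documentation example.
    half = n_chars // 2
    return word[:half] + word[-half:]

assert select_chars("jumping") == "jumpping"
```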
| Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -456,62 +456,6 @@ consists of either two or three subnetworks:
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |

### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.BILUOTagger.v1 "
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid BILUO tag
sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled
spans into tags assigned to each token. The first token of a span is given the
tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens
within the span are given the tag `I-LABEL`. Single-token spans are given the
tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme
generally results in better linear separation between classes, especially for
non-CRF models, because there are more distinct classes for the different
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
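A concrete BILUO tagging of a short sentence, produced with the offsets helper that this diff moves into `spacy.training`; the sentence and span are made up for illustration:

```python
import spacy
from spacy.training import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("I like New York City")
# Character offsets for the span "New York City", labelled GPE.
tags = biluo_tags_from_offsets(doc, [(7, 20, "GPE")])
print(tags)  # ['O', 'O', 'B-GPE', 'I-GPE', 'L-GPE']
```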
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.IOBTagger.v1 "
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid IOB tag
sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled
spans into tags assigned to each token. The first token of a span is given the
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
are assigned the tag O.
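The IOB and BILUO schemes are interconvertible, and the converters touched elsewhere in this diff rely on exactly that. A small sketch using the helper whose import moves to `spacy.training`:

```python
from spacy.training import iob_to_biluo

print(iob_to_biluo(["I-ORG", "I-ORG", "O", "I-PER"]))
# ['B-ORG', 'L-ORG', 'O', 'U-PER']
```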
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}

### spacy.Tagger.v1 {#Tagger}
@@ -38,7 +38,7 @@ how the component should be configured. You can override its settings via the
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |

```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py
%%GITHUB_SPACY/spacy/pipeline/attributeruler.py
```

## AttributeRuler.\_\_init\_\_ {#init tag="method"}
Some files were not shown because too many files have changed in this diff.