Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-09-12 17:55:45 +02:00
commit 472b9b4fa3
180 changed files with 2008 additions and 1902 deletions

View File

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
 endif
 ifndef PYVER

View File

@@ -1,7 +1,7 @@
 from pathlib import Path
 import plac
 import spacy
-from spacy.gold import docs_to_json
+from spacy.training import docs_to_json
 import srsly
 import sys
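Note: the spacy.gold package is renamed to spacy.training throughout this commit. A minimal usage sketch of the renamed helper (assumes spacy-nightly v3 is installed; the example text is illustrative):

    import spacy
    from spacy.training import docs_to_json

    nlp = spacy.blank("en")
    doc = nlp.make_doc("This is a sentence.")
    json_data = docs_to_json([doc])  # JSON-serializable training data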

View File

@@ -31,10 +31,13 @@ lang = "en"
 vectors = null
 [nlp.pipeline.ner]
-factory = "simple_ner"
+factory = "ner"
 [nlp.pipeline.ner.model]
-@architectures = "spacy.BiluoTagger.v1"
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
 [nlp.pipeline.ner.model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"

View File

@@ -6,7 +6,7 @@ requires = [
 "cymem>=2.0.2,<2.1.0",
 "preshed>=3.0.2,<3.1.0",
 "murmurhash>=0.28.0,<1.1.0",
-"thinc>=8.0.0a30,<8.0.0a40",
+"thinc>=8.0.0a31,<8.0.0a40",
 "blis>=0.4.0,<0.5.0",
 "pytokenizations",
 "pathy"

View File

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 murmurhash>=0.28.0,<1.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 install_requires =
 # Our libraries
 murmurhash>=0.28.0,<1.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0
@@ -64,7 +64,7 @@ console_scripts =
 [options.extras_require]
 lookups =
-spacy_lookups_data>=0.3.2,<0.4.0
+spacy_lookups_data==0.4.0.dev0
 cuda =
 cupy>=5.0.0b4,<9.0.0
 cuda80 =

View File

@@ -23,7 +23,7 @@ Options.docstrings = True
 PACKAGES = find_packages()
 MOD_NAMES = [
-    "spacy.gold.example",
+    "spacy.training.example",
     "spacy.parts_of_speech",
     "spacy.strings",
     "spacy.lexeme",
@@ -48,7 +48,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
-    "spacy.gold.gold_io",
+    "spacy.training.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",
     "spacy.tokens.token",

View File

@@ -1,7 +1,8 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a14"
+__version__ = "3.0.0a16"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__projects__ = "https://github.com/explosion/spacy-boilerplates"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"

View File

@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
 import sys
 import shutil
 from pathlib import Path
@@ -6,6 +6,7 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
+import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -13,7 +14,7 @@ from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir
+from ..util import import_file, run_command, make_tempdir, registry
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -54,6 +55,8 @@ app.add_typer(init_cli)
 def setup_cli() -> None:
+    # Make sure the entry-point for CLI runs, so that they get imported.
+    registry.cli.get_all()
     # Ensure that the help messages always display the correct prompt
     command = get_command(app)
     command(prog_name=COMMAND)
@@ -318,33 +321,87 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
     # *that* we can do by path.
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
+        git_version = get_git_version()
+        supports_sparse = git_version >= (2, 22)
         # This is the "clone, but don't download anything" part.
-        cmd = (
-            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            f"--filter=blob:none "  # <-- The key bit
-            f"-b {branch}"
-        )
-        run_command(cmd, capture=True)
+        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
+        if supports_sparse:
+            cmd += f"--filter=blob:none"  # <-- The key bit
+        else:
+            msg.warn(
+                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
+                f"that doesn't fully support sparse checkout yet. This means that "
+                f"more files than necessary may be downloaded temporarily. To "
+                f"only download the files needed, upgrade to Git v2.22 or above."
+            )
+        _attempt_run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
-        ret = run_command(cmd, capture=True)
-        repo = _from_http_to_git(repo)
+        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
+        ret = _attempt_run_command(cmd)
+        git_repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
-        run_command(cmd, capture=True)
+        if supports_sparse and not missings:
+            err = (
+                f"Could not find any relevant files for '{subpath}'. "
+                f"Did you specify a correct and complete path within repo '{repo}' "
+                f"and branch {branch}?"
+            )
+            msg.fail(err, exits=1)
+        if supports_sparse:
+            cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
+            _attempt_run_command(cmd)
         # And finally, we can checkout our subpath
         cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        run_command(cmd)
+        _attempt_run_command(cmd)
         # We need Path(name) to make sure we also support subdirectories
         shutil.move(str(tmp_dir / Path(subpath)), str(dest))

-def _from_http_to_git(repo):
+def get_git_version() -> Tuple[int, int]:
+    ret = _attempt_run_command(["git", "--version"])
+    # TODO: this seems kinda brittle?
+    version = ret.stdout[11:].strip().split(".")
+    return (int(version[0]), int(version[1]))
+
+
+def _attempt_run_command(cmd: Union[str, List[str]]):
+    try:
+        return run_command(cmd, capture=True)
+    except subprocess.CalledProcessError as e:
+        err = f"Could not run command"
+        msg.fail(err)
+        print(cmd)
+        sys.exit(1)
+
+
+def _from_http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")
     if repo.startswith(r"https://"):
         repo = repo.replace("https://", "git@").replace("/", ":", 1)
+    if repo.endswith("/"):
+        repo = repo[:-1]
     repo = f"{repo}.git"
     return repo
+
+
+def string_to_list(value, intify=False):
+    """Parse a comma-separated string to a list"""
+    if not value:
+        return []
+    if value.startswith("[") and value.endswith("]"):
+        value = value[1:-1]
+    result = []
+    for p in value.split(","):
+        p = p.strip()
+        if p.startswith("'") and p.endswith("'"):
+            p = p[1:-1]
+        if p.startswith('"') and p.endswith('"'):
+            p = p[1:-1]
+        p = p.strip()
+        if intify:
+            p = int(p)
+        result.append(p)
+    return result
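A quick usage sketch of the new string_to_list helper (inputs are illustrative):

    string_to_list("tagger,parser,ner")       # ["tagger", "parser", "ner"]
    string_to_list("[0, 1, 2]", intify=True)  # [0, 1, 2]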

View File

@@ -7,9 +7,9 @@ import re
 import sys
 from ._util import app, Arg, Opt
-from ..gold import docs_to_json
+from ..training import docs_to_json
 from ..tokens import DocBin
-from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
 # Converters are matched by file extension except for ner/iob, which are

View File

@@ -8,7 +8,7 @@ import typer
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, get_sourced_components
-from ..gold import Corpus, Example
+from ..training import Corpus, Example
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
 from .. import util

View File

@@ -5,7 +5,7 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation
 import typer
-from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
+from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
 from .. import util
@@ -38,12 +38,13 @@ def debug_model_cli(
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+    layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
         "parameters": parameters,
         "gradients": gradients,
         "attributes": attributes,
-        "layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
+        "layers": layers,
         "print_before_training": P0,
         "print_after_init": P1,
         "print_after_training": P2,
@@ -84,11 +85,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
        _print_model(model, print_settings)
    # STEP 1: Initializing the model and printing again
+    X = _get_docs()
     Y = _get_output(model.ops.xp)
-    _set_output_dim(nO=Y.shape[-1], model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=_get_docs(), Y=Y)
+        model.initialize(X=X, Y=Y)
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
@@ -135,15 +136,6 @@ def _get_output(xp):
     return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")

-def _set_output_dim(model, nO):
-    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
-    if model.has_dim("nO") is None:
-        model.set_dim("nO", nO)
-    if model.has_ref("output_layer"):
-        if model.get_ref("output_layer").has_dim("nO") is None:
-            model.get_ref("output_layer").set_dim("nO", nO)

 def _print_model(model, print_settings):
     layers = print_settings.get("layers", "")
     parameters = print_settings.get("parameters", False)

View File

@@ -5,7 +5,7 @@ import re
 import srsly
 from thinc.api import require_gpu, fix_random_seed
-from ..gold import Corpus
+from ..training import Corpus
 from ..tokens import Doc
 from ._util import app, Arg, Opt
 from ..scorer import Scorer

View File

@@ -9,7 +9,7 @@ import re
 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list
 ROOT = Path(__file__).parent / "templates"
@@ -42,7 +42,7 @@ def init_config_cli(
     """
     if isinstance(optimize, Optimizations):  # instance of enum from the CLI
         optimize = optimize.value
-    pipeline = [p.strip() for p in pipeline.split(",")]
+    pipeline = string_to_list(pipeline)
     init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
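Since the value now goes through string_to_list, --pipeline accepts both a plain comma-separated string and a bracketed list. An illustrative invocation (the output file name is a placeholder):

    python -m spacy init config config.cfg --lang en --pipeline "tagger,parser,ner"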

View File

@@ -256,6 +256,7 @@ def add_vectors(
 def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
     f = open_file(vectors_loc)
+    f = ensure_shape(f)
     shape = tuple(int(size) for size in next(f).split())
     if truncate_vectors >= 1:
         shape = (truncate_vectors, shape[1])
@@ -274,6 +275,31 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
     return vectors_data, vectors_keys

+def ensure_shape(lines):
+    """Ensure that the first line of the data is the vectors shape.
+    If it's not, we read in the data and output the shape as the first result,
+    so that the reader doesn't have to deal with the problem.
+    """
+    first_line = next(lines)
+    try:
+        shape = tuple(int(size) for size in first_line.split())
+    except ValueError:
+        shape = None
+    if shape is not None:
+        # All good, give the data
+        yield first_line
+        yield from lines
+    else:
+        # Figure out the shape, make it the first value, and then give the
+        # rest of the data.
+        width = len(first_line.split()) - 1
+        captured = [first_line] + list(lines)
+        length = len(captured)
+        yield f"{length} {width}"
+        yield from captured

 def read_freqs(
     freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
 ):
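A hedged sketch of what the new ensure_shape generator does for a vectors file that has no header line (the toy vectors are illustrative):

    lines = iter(["king 0.1 0.2 0.3", "queen 0.2 0.1 0.4"])
    fixed = ensure_shape(lines)
    print(next(fixed))  # "2 3", the inferred rows/width header
    print(list(fixed))  # the original two vector lines, unchanged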

View File

@@ -18,6 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
     force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
@@ -38,6 +39,7 @@ def package_cli(
         input_dir,
         output_dir,
         meta_path=meta_path,
+        name=name,
         version=version,
         create_meta=create_meta,
         create_sdist=not no_sdist,
@@ -50,6 +52,7 @@ def package(
     input_dir: Path,
     output_dir: Path,
     meta_path: Optional[Path] = None,
+    name: Optional[str] = None,
     version: Optional[str] = None,
     create_meta: bool = False,
     create_sdist: bool = True,
@@ -71,6 +74,8 @@ def package(
         msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
     meta = get_meta(input_dir, meta)
+    if name is not None:
+        meta["name"] = name
     if version is not None:
         meta["version"] = version
     if not create_meta:  # only print if user doesn't want to overwrite
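An illustrative invocation of the new --name override (paths, names and versions are placeholders):

    python -m spacy package ./training/model-best ./packages --name my_pipeline --version 0.0.1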

View File

@@ -38,16 +38,21 @@ def project_assets(project_dir: Path) -> None:
         msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
     msg.info(f"Fetching {len(assets)} asset(s)")
     for asset in assets:
-        dest = Path(asset["dest"])
+        dest = (project_dir / asset["dest"]).resolve()
         checksum = asset.get("checksum")
         if "git" in asset:
             if dest.exists():
                 # If there's already a file, check for checksum
                 if checksum and checksum == get_checksum(dest):
-                    msg.good(f"Skipping download with matching checksum: {dest}")
+                    msg.good(
+                        f"Skipping download with matching checksum: {asset['dest']}"
+                    )
                     continue
                 else:
+                    if dest.is_dir():
                         shutil.rmtree(dest)
+                    else:
+                        dest.unlink()
             git_sparse_checkout(
                 asset["git"]["repo"],
                 asset["git"]["path"],
@@ -67,14 +72,16 @@ def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
     """Check and validate assets without a URL (private assets that the user
     has to provide themselves) and give feedback about the checksum.
-    dest (Path): Desintation path of the asset.
+    dest (Path): Destination path of the asset.
     checksum (Optional[str]): Optional checksum of the expected file.
     """
     if not Path(dest).exists():
         err = f"No URL provided for asset. You need to add this file yourself: {dest}"
         msg.warn(err)
     else:
-        if checksum and checksum == get_checksum(dest):
+        if not checksum:
+            msg.good(f"Asset already exists: {dest}")
+        elif checksum == get_checksum(dest):
             msg.good(f"Asset exists with matching checksum: {dest}")
         else:
             msg.fail(f"Asset available but with incorrect checksum: {dest}")

View File

@@ -16,6 +16,7 @@ def project_clone_cli(
     name: str = Arg(..., help="The name of the template to clone"),
     dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
     repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
+    branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from")
     # fmt: on
 ):
     """Clone a project template from a repository. Calls into "git" and will
@@ -26,23 +27,30 @@ def project_clone_cli(
     DOCS: https://nightly.spacy.io/api/cli#project-clone
     """
     if dest is None:
-        dest = Path.cwd() / name
-    project_clone(name, dest, repo=repo)
+        dest = Path.cwd() / Path(name).parts[-1]
+    project_clone(name, dest, repo=repo, branch=branch)

-def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
+def project_clone(
+    name: str,
+    dest: Path,
+    *,
+    repo: str = about.__projects__,
+    branch: str = about.__projects_branch__,
+) -> None:
     """Clone a project template from a repository.
     name (str): Name of subdirectory to clone.
     dest (Path): Destination path of cloned project.
     repo (str): URL of Git repo containing project templates.
+    branch (str): The branch to clone from
     """
     dest = ensure_path(dest)
     check_clone(name, dest, repo)
     project_dir = dest.resolve()
     repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
     try:
-        git_sparse_checkout(repo, name, dest)
+        git_sparse_checkout(repo, name, dest, branch=branch)
     except subprocess.CalledProcessError:
         err = f"Could not clone '{name}' from repo '{repo_name}'"
         msg.fail(err, exits=1)
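With the new --branch option and the Path(name).parts[-1] default, templates that live in subdirectories of the repo clone into a directory named after the last path component. An illustrative invocation (the template path is a placeholder):

    python -m spacy project clone pipelines/tagger_parser_ud --branch v3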

View File

@@ -1,4 +1,5 @@
 from typing import Optional, Dict, Any, Tuple, Union, Callable, List
+from timeit import default_timer as timer
 import srsly
 import tqdm
 from pathlib import Path
@@ -15,7 +16,7 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, get_sourced_components
 from ..language import Language
 from .. import util
-from ..gold.example import Example
+from ..training.example import Example
 from ..errors import Errors
@@ -286,9 +287,12 @@ def train_while_improving(
     ]
     raw_batches = util.minibatch(raw_examples, size=8)

+    words_seen = 0
+    start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
         dropout = next(dropouts)
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
                 subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
             )
@@ -317,6 +321,7 @@
         else:
             score, other_scores = (None, None)
             is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
         info = {
             "epoch": epoch,
             "step": step,
@@ -324,6 +329,8 @@
             "other_scores": other_scores,
             "losses": losses,
             "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
         }
         yield batch, info, is_best_checkpoint
         if is_best_checkpoint is not None:
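The new "seconds" and "words" fields make it straightforward to derive throughput from the info dict, for example in a custom logger. A minimal sketch (the variable name is illustrative):

    words_per_second = info["words"] / max(info["seconds"], 1)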

View File

@@ -52,7 +52,7 @@ path = ${paths.train}
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length
-max_length = 2000
+max_length = 0
 # Limitation on number of training examples
 limit = 0
@@ -64,7 +64,7 @@ path = ${paths.dev}
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length
-max_length = 2000
+max_length = 0
 # Limitation on number of training examples
 limit = 0
@@ -88,9 +88,4 @@ L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 1e-8
+learn_rate = 0.001
-
-[training.optimizer.learn_rate]
-@schedules = "warmup_linear.v1"
-warmup_steps = 250
-total_steps = 20000
-initial_rate = 0.001
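The default config now uses a constant learn_rate instead of a warmup schedule. If the old behaviour is wanted, the removed block can be added back in a user config under the optimizer section, e.g.:

    [training.optimizer.learn_rate]
    @schedules = "warmup_linear.v1"
    warmup_steps = 250
    total_steps = 20000
    initial_rate = 0.001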

View File

@@ -66,7 +66,7 @@ class Warnings:
            "in problems with the vocab further on in the pipeline.")
    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
            "entities \"{entities}\". Use "
-           "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+           "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
            " to check the alignment. Misaligned entities ('-') will be "
            "ignored during training.")
    W033 = ("Training a new {model} using a model with no lexeme normalization "
@@ -247,8 +247,8 @@ class Errors:
            "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
    E065 = ("Only one of the vector table's width and shape can be specified. "
            "Got width {width} and shape {shape}.")
-   E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
-           "an entity) without a preceding 'B' (beginning of an entity). "
+   E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} "
+           "without a preceding 'B' (beginning of an entity). "
            "Tag sequence:\n{tags}")
    E068 = ("Invalid BILUO tag: '{tag}'.")
    E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
@@ -320,10 +320,6 @@
            "So instead of pickling the span, pickle the Doc it belongs to or "
            "use Span.as_doc to convert the span to a standalone Doc object.")
    E115 = ("All subtokens must have associated heads.")
-   E116 = ("Cannot currently add labels to pretrained text classifier. Add "
-           "labels before training begins. This functionality was available "
-           "in previous versions, but had significant bugs that led to poor "
-           "performance.")
    E117 = ("The newly split tokens must match the text of the original token. "
            "New orths: {new}. Old text: {old}.")
    E118 = ("The custom extension attribute '{attr}' is not registered on the "
@@ -378,8 +374,9 @@
            "should be of equal length.")
    E141 = ("Entity vectors should be of length {required} instead of the "
            "provided {found}.")
-   E143 = ("Labels for component '{name}' not initialized. Did you forget to "
-           "call add_label()?")
+   E143 = ("Labels for component '{name}' not initialized. This can be fixed "
+           "by calling add_label, or by providing a representative batch of "
+           "examples to the component's begin_training method.")
    E145 = ("Error reading `{param}` from input file.")
    E146 = ("Could not access `{path}`.")
    E147 = ("Unexpected error in the {method} functionality of the "
@@ -483,6 +480,16 @@
    E201 = ("Span index out of range.")
    # TODO: fix numbering after merging develop into master
+   E921 = ("The method 'set_output' can only be called on components that have "
+           "a Model with a 'resize_output' attribute. Otherwise, the output "
+           "layer can not be dynamically changed.")
+   E922 = ("Component '{name}' has been initialized with an output dimension of "
+           "{nO} - cannot add any more labels.")
+   E923 = ("It looks like there is no proper sample data to initialize the "
+           "Model of component '{name}'. "
+           "This is likely a bug in spaCy, so feel free to open an issue.")
+   E924 = ("The '{name}' component does not seem to be initialized properly. "
+           "This is likely a bug in spaCy, so feel free to open an issue.")
    E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
            "mapping label names to colors but got: {obj}")
    E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
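As the updated W030 message suggests, entity alignment can be checked with the renamed helper. A minimal sketch (text and offsets are illustrative):

    import spacy
    from spacy.training import biluo_tags_from_offsets

    nlp = spacy.blank("en")
    doc = nlp.make_doc("I like London.")
    tags = biluo_tags_from_offsets(doc, [(7, 13, "LOC")])
    # ['O', 'O', 'U-LOC', 'O']; misaligned entities would show up as '-'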

View File

@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .gold import Example, validate_examples
+from .training import Example, validate_examples
 from .scorer import Scorer
 from .util import create_default_optimizer, registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@@ -243,6 +243,7 @@ class Language:
         self._config["nlp"]["pipeline"] = list(self.component_names)
         self._config["nlp"]["disabled"] = list(self.disabled)
         self._config["components"] = pipeline
+        if not self._config["training"].get("score_weights"):
             self._config["training"]["score_weights"] = combine_score_weights(score_weights)
         if not srsly.is_json_serializable(self._config):
             raise ValueError(Errors.E961.format(config=self._config))
@@ -656,7 +657,7 @@ class Language:
         return resolved[factory_name]

     def create_pipe_from_source(
-        self, source_name: str, source: "Language", *, name: str,
+        self, source_name: str, source: "Language", *, name: str
     ) -> Tuple[Callable[[Doc], Doc], str]:
         """Create a pipeline component by copying it from an existing model.
@@ -1155,10 +1156,13 @@ class Language:
         DOCS: https://nightly.spacy.io/api/language#begin_training
         """
-        # TODO: throw warning when get_gold_tuples is provided instead of get_examples
         if get_examples is None:
-            get_examples = lambda: []
-        else:  # Populate vocab
+            util.logger.debug(
+                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+            )
+            doc = Doc(self.vocab, words=["x", "y", "z"])
+            get_examples = lambda: [Example.from_dict(doc, {})]
+        # Populate vocab
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="Language", obj=type(get_examples))
             raise ValueError(err)
@@ -1187,7 +1191,7 @@ class Language:
         return self._optimizer

     def resume_training(
-        self, *, sgd: Optional[Optimizer] = None, device: int = -1,
+        self, *, sgd: Optional[Optimizer] = None, device: int = -1
     ) -> Optimizer:
         """Continue training a pretrained model.

View File

@@ -1,105 +0,0 @@
-"""Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import Dict, Optional
-import numpy
-from thinc.api import Model
-from thinc.types import Padded, Floats3d
-
-
-def BILUO() -> Model[Padded, Padded]:
-    return Model(
-        "biluo",
-        forward,
-        init=init,
-        dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions},
-    )
-
-
-def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
-    if X is not None and Y is not None:
-        if X.data.shape != Y.data.shape:
-            # TODO: Fix error
-            raise ValueError("Mismatched shapes (TODO: Fix message)")
-        model.set_dim("nO", X.data.shape[2])
-    elif X is not None:
-        model.set_dim("nO", X.data.shape[2])
-    elif Y is not None:
-        model.set_dim("nO", Y.data.shape[2])
-    elif model.get_dim("nO") is None:
-        raise ValueError("Dimension unset for BILUO: nO")
-
-
-def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
-    n_labels = (model.get_dim("nO") - 1) // 4
-    n_tokens, n_docs, n_actions = Xp.data.shape
-    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
-    # to indicate which actions are valid next for each sequence. To construct
-    # the mask, we have a state of shape (2, n_actions) and a validity table of
-    # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
-    # whether it's the last token, the second dimension indicates the previous
-    # action, plus a special 'null action' for the first entry.
-    valid_transitions = model.ops.asarray(_get_transition_table(n_labels))
-    prev_actions = model.ops.alloc1i(n_docs)
-    # Initialize as though prev action was O
-    prev_actions.fill(n_actions - 1)
-    Y = model.ops.alloc3f(*Xp.data.shape)
-    masks = model.ops.alloc3f(*Y.shape)
-    max_value = Xp.data.max()
-    for t in range(Xp.data.shape[0]):
-        is_last = (Xp.lengths < (t + 2)).astype("i")
-        masks[t] = valid_transitions[is_last, prev_actions]
-        # Don't train the out-of-bounds sequences.
-        masks[t, Xp.size_at_t[t] :] = 0
-        # Valid actions get 0*10e8, invalid get large negative value
-        Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
-        prev_actions = Y[t].argmax(axis=-1)
-
-    def backprop_biluo(dY: Padded) -> Padded:
-        dY.data *= masks
-        return dY
-
-    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
-
-
-def get_num_actions(n_labels: int) -> int:
-    # One BEGIN action per label
-    # One IN action per label
-    # One LAST action per label
-    # One UNIT action per label
-    # One OUT action
-    return n_labels + n_labels + n_labels + n_labels + 1
-
-
-def _get_transition_table(
-    n_labels: int, *, _cache: Dict[int, Floats3d] = {}
-) -> Floats3d:
-    n_actions = get_num_actions(n_labels)
-    if n_actions in _cache:
-        return _cache[n_actions]
-    table = numpy.zeros((2, n_actions, n_actions), dtype="f")
-    B_start, B_end = (0, n_labels)
-    I_start, I_end = (B_end, B_end + n_labels)
-    L_start, L_end = (I_end, I_end + n_labels)
-    U_start, _ = (L_end, L_end + n_labels)  # noqa: F841
-    # Using ranges allows us to set specific cells, which is necessary to express
-    # that only actions of the same label are valid continuations.
-    B_range = numpy.arange(B_start, B_end)
-    I_range = numpy.arange(I_start, I_end)
-    L_range = numpy.arange(L_start, L_end)
-    # If this is the last token and the previous action was B or I, only L
-    # of that label is valid
-    table[1, B_range, L_range] = 1
-    table[1, I_range, L_range] = 1
-    # If this isn't the last token and the previous action was B or I, only I or
-    # L of that label are valid.
-    table[0, B_range, I_range] = 1
-    table[0, B_range, L_range] = 1
-    table[0, I_range, I_range] = 1
-    table[0, I_range, L_range] = 1
-    # If this isn't the last token and the previous was L, U or O, B is valid
-    table[0, L_start:, :B_end] = 1
-    # Regardless of whether this is the last token, if the previous action was
-    # {L, U, O}, U and O are valid.
-    table[:, L_start:, U_start:] = 1
-    _cache[n_actions] = table
-    return table

View File

@@ -1,90 +0,0 @@
-"""Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import Dict, Optional
-from thinc.api import Ops, Model
-from thinc.types import Padded, Floats3d
-
-
-def IOB() -> Model[Padded, Padded]:
-    return Model(
-        "biluo",
-        forward,
-        init=init,
-        dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions},
-    )
-
-
-def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
-    if X is not None and Y is not None:
-        if X.data.shape != Y.data.shape:
-            # TODO: Fix error
-            raise ValueError("Mismatched shapes (TODO: Fix message)")
-        model.set_dim("nO", X.data.shape[2])
-    elif X is not None:
-        model.set_dim("nO", X.data.shape[2])
-    elif Y is not None:
-        model.set_dim("nO", Y.data.shape[2])
-    elif model.get_dim("nO") is None:
-        raise ValueError("Dimension unset for BILUO: nO")
-
-
-def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
-    n_labels = (model.get_dim("nO") - 1) // 2
-    n_tokens, n_docs, n_actions = Xp.data.shape
-    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
-    # to indicate which actions are valid next for each sequence. To construct
-    # the mask, we have a state of shape (2, n_actions) and a validity table of
-    # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
-    # whether it's the last token, the second dimension indicates the previous
-    # action, plus a special 'null action' for the first entry.
-    valid_transitions = _get_transition_table(model.ops, n_labels)
-    prev_actions = model.ops.alloc1i(n_docs)
-    # Initialize as though prev action was O
-    prev_actions.fill(n_actions - 1)
-    Y = model.ops.alloc3f(*Xp.data.shape)
-    masks = model.ops.alloc3f(*Y.shape)
-    for t in range(Xp.data.shape[0]):
-        masks[t] = valid_transitions[prev_actions]
-        # Don't train the out-of-bounds sequences.
-        masks[t, Xp.size_at_t[t] :] = 0
-        # Valid actions get 0*10e8, invalid get -1*10e8
-        Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
-        prev_actions = Y[t].argmax(axis=-1)
-
-    def backprop_biluo(dY: Padded) -> Padded:
-        # Masking the gradient seems to do poorly here. But why?
-        # dY.data *= masks
-        return dY
-
-    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
-
-
-def get_num_actions(n_labels: int) -> int:
-    # One BEGIN action per label
-    # One IN action per label
-    # One LAST action per label
-    # One UNIT action per label
-    # One OUT action
-    return n_labels * 2 + 1
-
-
-def _get_transition_table(
-    ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {}
-) -> Floats3d:
-    n_actions = get_num_actions(n_labels)
-    if n_actions in _cache:
-        return ops.asarray(_cache[n_actions])
-    table = ops.alloc2f(n_actions, n_actions)
-    B_start, B_end = (0, n_labels)
-    I_start, I_end = (B_end, B_end + n_labels)
-    O_action = I_end
-    B_range = ops.xp.arange(B_start, B_end)
-    I_range = ops.xp.arange(I_start, I_end)
-    # B and O are always valid
-    table[:, B_start:B_end] = 1
-    table[:, O_action] = 1
-    # I can only follow a matching B
-    table[B_range, I_range] = 1
-    _cache[n_actions] = table
-    return table

View File

@@ -1,6 +1,5 @@
 from .entity_linker import *  # noqa
 from .parser import *  # noqa
-from .simple_ner import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
 from .tok2vec import *  # noqa

View File

@@ -1,104 +0,0 @@
-from typing import List
-from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
-from thinc.api import chain, list2padded, configure_normal_init
-from thinc.api import Dropout
-from thinc.types import Floats2d
-
-from ...tokens import Doc
-from .._biluo import BILUO
-from .._iob import IOB
-from ...util import registry
-
-
-@registry.architectures.register("spacy.BILUOTagger.v1")
-def BiluoTagger(
-    tok2vec: Model[List[Doc], List[Floats2d]]
-) -> Model[List[Doc], List[Floats2d]]:
-    """Construct a simple NER tagger, that predicts BILUO tag scores for each
-    token and uses greedy decoding with transition-constraints to return a valid
-    BILUO tag sequence.
-
-    A BILUO tag sequence encodes a sequence of non-overlapping labelled spans
-    into tags assigned to each token. The first token of a span is given the
-    tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens
-    within the span are given the tag U-LABEL. Single-token spans are given
-    the tag U-LABEL. All other tokens are assigned the tag O.
-
-    The BILUO tag scheme generally results in better linear separation between
-    classes, especially for non-CRF models, because there are more distinct classes
-    for the different situations (Ratinov et al., 2009).
-    """
-    biluo = BILUO()
-    linear = Linear(
-        nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
-    )
-    model = chain(
-        tok2vec,
-        list2padded(),
-        with_array(chain(Dropout(0.1), linear)),
-        biluo,
-        with_array(softmax_activation()),
-        padded2list(),
-    )
-    return Model(
-        "biluo-tagger",
-        forward,
-        init=init,
-        layers=[model, linear],
-        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
-        dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
-    )
-
-
-@registry.architectures.register("spacy.IOBTagger.v1")
-def IOBTagger(
-    tok2vec: Model[List[Doc], List[Floats2d]]
-) -> Model[List[Doc], List[Floats2d]]:
-    """Construct a simple NER tagger, that predicts IOB tag scores for each
-    token and uses greedy decoding with transition-constraints to return a valid
-    IOB tag sequence.
-
-    An IOB tag sequence encodes a sequence of non-overlapping labelled spans
-    into tags assigned to each token. The first token of a span is given the
-    tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
-    All other tokens are assigned the tag O.
-    """
-    biluo = IOB()
-    linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
-    model = chain(
-        tok2vec,
-        list2padded(),
-        with_array(linear),
-        biluo,
-        with_array(softmax_activation()),
-        padded2list(),
-    )
-    return Model(
-        "iob-tagger",
-        forward,
-        init=init,
-        layers=[model],
-        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
-        dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
-    )
-
-
-def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
-    if model.get_dim("nO") is None and Y:
-        model.set_dim("nO", Y[0].shape[1])
-    nO = model.get_dim("nO")
-    biluo = model.get_ref("biluo")
-    linear = model.get_ref("linear")
-    biluo.set_dim("nO", nO)
-    if linear.has_dim("nO") is None:
-        linear.set_dim("nO", nO)
-    model.layers[0].initialize(X=X, Y=Y)
-
-
-def forward(model: Model, X: List[Doc], is_train: bool):
-    return model.layers[0](X, is_train)
-
-
-__all__ = ["BiluoTagger"]

View File

@@ -165,7 +165,7 @@ def MultiHashEmbed(
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
-    """Construct an embedded representations based on character embeddings, using
+    """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
     used in the centre for words that are too short.
@@ -176,8 +176,8 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
     ensures that the final character is always in the last position, instead
     of being in an arbitrary position depending on the word length.
-    The characters are embedded in a embedding table with 256 rows, and the
-    vectors concatenated. A hash-embedded vector of the NORM of the word is
+    The characters are embedded in a embedding table with a given number of rows,
+    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.

View File

@@ -8,7 +8,6 @@ from .morphologizer import Morphologizer
 from .pipe import Pipe
 from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
-from .simple_ner import SimpleNER
 from .tagger import Tagger
 from .textcat import TextCategorizer
 from .tok2vec import Tok2Vec
@@ -25,7 +24,6 @@ __all__ = [
     "Pipe",
     "SentenceRecognizer",
     "Sentencizer",
-    "SimpleNER",
     "Tagger",
     "TextCategorizer",
     "Tok2Vec",

View File

@@ -8,7 +8,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC

View File

@@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
 from ...typedefs cimport weight_t, attr_t
 from ...lexeme cimport Lexeme
 from ...attrs cimport IS_SPACE
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC

View File

@@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
 from ...typedefs cimport attr_t, weight_t
 from ...structs cimport TokenC
 from ...strings cimport StringStore
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC

View File

@@ -4,7 +4,7 @@ from pathlib import Path
 from .pipe import Pipe
 from ..errors import Errors
-from ..gold import validate_examples
+from ..training import validate_examples
 from ..language import Language
 from ..matcher import Matcher
 from ..scorer import Scorer

View File

@@ -9,7 +9,7 @@ from .functions import merge_subtokens
 from ..language import Language
 from ._parser_internals import nonproj
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples
 default_model_config = """

View File

@@ -1,3 +1,4 @@
+from itertools import islice
 from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
 from pathlib import Path
 import srsly
@@ -11,7 +12,7 @@ from ..tokens import Doc
 from .pipe import Pipe, deserialize_config
 from ..language import Language
 from ..vocab import Vocab
-from ..gold import Example, validate_examples
+from ..training import Example, validate_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
@@ -128,7 +129,7 @@ class EntityLinker(Pipe):
         # how many neightbour sentences to take into account
         self.n_sents = cfg.get("n_sents", 0)

-    def require_kb(self) -> None:
+    def _require_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
@@ -140,10 +141,11 @@ class EntityLinker(Pipe):
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
-        """Initialize the pipe for training, using data examples if available.
+        """Initialize the pipe for training, using a representative set
+        of data examples.
-        get_examples (Callable[[], Iterable[Example]]): Optional function that
-            returns gold-standard Example objects.
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
@@ -153,10 +155,19 @@ class EntityLinker(Pipe):
         DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
         """
-        self.require_kb()
+        self._ensure_examples(get_examples)
+        self._require_kb()
         nO = self.kb.entity_vector_length
-        self.set_output(nO)
-        self.model.initialize()
+        doc_sample = []
+        vector_sample = []
+        for example in islice(get_examples(), 10):
+            doc_sample.append(example.x)
+            vector_sample.append(self.model.ops.alloc1f(nO))
+        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
+        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
+        self.model.initialize(
+            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
+        )
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
@@ -184,7 +195,7 @@
         DOCS: https://nightly.spacy.io/api/entitylinker#update
         """
-        self.require_kb()
+        self._require_kb()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -296,7 +307,7 @@
         DOCS: https://nightly.spacy.io/api/entitylinker#predict
         """
-        self.require_kb()
+        self._require_kb()
         entity_count = 0
         final_kb_ids = []
         if not docs:
@@ -405,7 +416,7 @@
         token.ent_kb_id_ = kb_id

     def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
         """Serialize the pipe to disk.
@@ -422,7 +433,7 @@
         util.to_disk(path, serialize, exclude)

     def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityLinker":
         """Load the pipe from disk. Modifies the object in place and returns it.

View File

@@ -9,7 +9,7 @@ from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples
 DEFAULT_ENT_ID_SEP = "||"

View File

@@ -8,7 +8,7 @@ from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
-from ..gold import validate_examples
+from ..training import validate_examples
 from .. import util

View File

@ -2,6 +2,7 @@
from typing import Optional from typing import Optional
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..vocab cimport Vocab from ..vocab cimport Vocab
@ -15,7 +16,7 @@ from .pipe import deserialize_config
from .tagger import Tagger from .tagger import Tagger
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
default_model_config = """ default_model_config = """
@ -112,6 +113,7 @@ class Morphologizer(Tagger):
raise ValueError(Errors.E187) raise ValueError(Errors.E187)
if label in self.labels: if label in self.labels:
return 0 return 0
self._allow_extra_label()
# normalize label # normalize label
norm_label = self.vocab.morphology.normalize_features(label) norm_label = self.vocab.morphology.normalize_features(label)
# extract separate POS and morph tags # extract separate POS and morph tags
@ -128,10 +130,11 @@ class Morphologizer(Tagger):
return 1 return 1
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -141,9 +144,8 @@ class Morphologizer(Tagger):
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
""" """
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples)) # First, fetch all labels from the data
raise ValueError(err)
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
@ -157,8 +159,25 @@ class Morphologizer(Tagger):
if norm_label not in self.cfg["labels_morph"]: if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = morph self.cfg["labels_morph"][norm_label] = morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
self.set_output(len(self.labels)) if len(self.labels) <= 1:
self.model.initialize() raise ValueError(Errors.E143.format(name=self.name))
doc_sample = []
label_sample = []
for example in islice(get_examples(), 10):
gold_array = []
for i, token in enumerate(example.reference):
pos = token.pos_
morph = token.morph_
morph_dict = Morphology.feats_to_dict(morph)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
doc_sample.append(example.x)
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
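With this change the morphologizer collects its labels from the training data itself, so a typical caller no longer needs `add_label` before training. A short usage sketch matching the updated morphologizer tests further down in this diff; `TRAIN_DATA` stands for the morph/POS examples defined there:
from spacy.lang.en import English
from spacy.training import Example
nlp = English()
nlp.add_pipe("morphologizer")
# TRAIN_DATA as defined in the morphologizer tests later in this diff
train_examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA]
# labels are read off the examples; the model is initialized with real docs and one-hot targets
optimizer = nlp.begin_training(get_examples=lambda: train_examples)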

View File

@ -8,7 +8,7 @@ from ..tokens.doc cimport Doc
from .pipe import Pipe from .pipe import Pipe
from .tagger import Tagger from .tagger import Tagger
from ..gold import validate_examples from ..training import validate_examples
from ..language import Language from ..language import Language
from ._parser_internals import nonproj from ._parser_internals import nonproj
from ..attrs import POS, ID from ..attrs import POS, ID
@ -90,7 +90,7 @@ class MultitaskObjective(Tagger):
label = self.make_label(token) label = self.make_label(token)
if label is not None and label not in self.labels: if label is not None and label not in self.labels:
self.labels[label] = len(self.labels) self.labels[label] = len(self.labels)
self.model.initialize() self.model.initialize() # TODO: fix initialization by defining X and Y
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
@ -178,7 +178,7 @@ class ClozeMultitask(Pipe):
pass pass
def begin_training(self, get_examples, pipeline=None, sgd=None): def begin_training(self, get_examples, pipeline=None, sgd=None):
self.model.initialize() self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X) self.model.output_layer.begin_training(X)
if sgd is None: if sgd is None:

View File

@ -7,7 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
default_model_config = """ default_model_config = """

View File

@ -4,7 +4,7 @@ from thinc.api import set_dropout_rate, Model
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..gold import validate_examples from ..training import validate_examples
from ..errors import Errors from ..errors import Errors
from .. import util from .. import util
@ -160,6 +160,20 @@ cdef class Pipe:
""" """
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
def _require_labels(self) -> None:
"""Raise an error if the component's model has no labels defined."""
if not self.labels or list(self.labels) == [""]:
raise ValueError(Errors.E143.format(name=self.name))
def _allow_extra_label(self) -> None:
"""Raise an error if the component can not add any more labels."""
if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
if not self.is_resizable():
raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
def create_optimizer(self): def create_optimizer(self):
"""Create an optimizer for the pipeline component. """Create an optimizer for the pipeline component.
@ -171,9 +185,12 @@ cdef class Pipe:
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly
using the provided sample of Example objects.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -183,16 +200,24 @@ cdef class Pipe:
DOCS: https://nightly.spacy.io/api/pipe#begin_training DOCS: https://nightly.spacy.io/api/pipe#begin_training
""" """
self.model.initialize() raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
if sgd is None:
sgd = self.create_optimizer() def _ensure_examples(self, get_examples):
return sgd if get_examples is None or not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name=self.name, obj=type(get_examples))
raise ValueError(err)
if not get_examples():
err = Errors.E930.format(name=self.name, obj=get_examples())
raise ValueError(err)
def is_resizable(self):
return hasattr(self, "model") and "resize_output" in self.model.attrs
def set_output(self, nO): def set_output(self, nO):
if self.model.has_dim("nO") is not False: if self.is_resizable():
self.model.set_dim("nO", nO) self.model.attrs["resize_output"](self.model, nO)
if self.model.has_ref("output_layer"): else:
self.model.get_ref("output_layer").set_dim("nO", nO) raise NotImplementedError(Errors.E921)
def use_params(self, params): def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values. At the """Modify the pipe's model, to use the given parameter values. At the

View File

@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
from .pipe import Pipe from .pipe import Pipe
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
from .. import util from .. import util

View File

@ -1,4 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from itertools import islice
import srsly import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@ -9,7 +11,7 @@ from .tagger import Tagger
from ..language import Language from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
from .. import util from .. import util
@ -124,10 +126,11 @@ class SentenceRecognizer(Tagger):
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -137,8 +140,18 @@ class SentenceRecognizer(Tagger):
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
""" """
self.set_output(len(self.labels)) self._ensure_examples(get_examples)
self.model.initialize() doc_sample = []
label_sample = []
assert self.labels, Errors.E924.format(name=self.name)
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
gold_tags = example.get_aligned("SENT_START")
gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
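The senter now builds one-hot target rows from the aligned SENT_START values of up to ten examples. The list comprehension in the hunk above amounts to the following small helper; the name `one_hot_rows` is hypothetical and shown only to make the construction explicit:
def one_hot_rows(labels, gold_tags):
    # one row per token: 1.0 in the column of the gold tag, 0.0 elsewhere
    return [[1.0 if tag == gold_tag else 0.0 for tag in labels] for gold_tag in gold_tags]
# gold_tags = example.get_aligned("SENT_START")
# label_sample.append(model.ops.asarray(one_hot_rows(senter.labels, gold_tags), dtype="float32"))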

View File

@ -1,211 +0,0 @@
from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
from thinc.types import Floats2d
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
from thinc.api import Optimizer, Config
from thinc.util import to_numpy
from ..errors import Errors
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
from ..gold import validate_examples
from ..tokens import Doc
from ..language import Language
from ..vocab import Vocab
from ..scorer import Scorer
from .pipe import Pipe
default_model_config = """
[model]
@architectures = "spacy.BILUOTagger.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 128
depth = 4
embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"simple_ner",
assigns=["doc.ents"],
default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
)
def make_simple_ner(
nlp: Language, name: str, model: Model, labels: Iterable[str]
) -> "SimpleNER":
return SimpleNER(nlp.vocab, model, name, labels=labels)
class SimpleNER(Pipe):
"""Named entity recognition with a tagging model. The model should include
validity constraints to ensure that only valid tag sequences are returned."""
def __init__(
self,
vocab: Vocab,
model: Model,
name: str = "simple_ner",
*,
labels: Iterable[str],
) -> None:
self.vocab = vocab
self.model = model
self.name = name
self.cfg = {"labels": []}
for label in labels:
self.add_label(label)
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(), normalize=True, missing_value=None
)
assert self.model is not None
@property
def is_biluo(self) -> bool:
return self.model.name.startswith("biluo")
@property
def labels(self) -> Tuple[str]:
return tuple(self.cfg["labels"])
def add_label(self, label: str) -> None:
"""Add a new label to the pipe.
label (str): The label to add.
DOCS: https://nightly.spacy.io/api/simplener#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
if label not in self.labels:
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
def get_tag_names(self) -> List[str]:
if self.is_biluo:
return (
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ [f"L-{label}" for label in self.labels]
+ [f"U-{label}" for label in self.labels]
+ ["O"]
)
else:
return (
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ ["O"]
)
def predict(self, docs: List[Doc]) -> List[Floats2d]:
scores = self.model.predict(docs)
return scores
def set_annotations(self, docs: List[Doc], scores: List[Floats2d]) -> None:
"""Set entities on a batch of documents from a batch of scores."""
tag_names = self.get_tag_names()
for i, doc in enumerate(docs):
actions = to_numpy(scores[i].argmax(axis=1))
tags = [tag_names[actions[j]] for j in range(len(doc))]
if not self.is_biluo:
tags = iob_to_biluo(tags)
doc.ents = spans_from_biluo_tags(doc, tags)
def update(
self,
examples: List[Example],
*,
set_annotations: bool = False,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
if losses is None:
losses = {}
losses.setdefault("ner", 0.0)
validate_examples(examples, "SimpleNER.update")
if not any(_has_ner(eg) for eg in examples):
return losses
docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update(docs)
loss, d_scores = self.get_loss(examples, scores)
bp_scores(d_scores)
if set_annotations:
self.set_annotations(docs, scores)
if sgd is not None:
self.model.finish_update(sgd)
losses["ner"] += loss
return losses
def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
validate_examples(examples, "SimpleNER.get_loss")
truths = []
for eg in examples:
tags = eg.get_aligned_ner()
gold_tags = [(tag if tag != "-" else None) for tag in tags]
if not self.is_biluo:
gold_tags = biluo_to_iob(gold_tags)
truths.append(gold_tags)
for i in range(len(scores)):
if len(scores[i]) != len(truths[i]):
raise ValueError(
f"Mismatched output and gold sizes.\n"
f"Output: {len(scores[i])}, gold: {len(truths[i])}."
f"Input: {len(examples[i].doc)}"
)
d_scores, loss = self.loss_func(scores, truths)
return loss, d_scores
def begin_training(
self,
get_examples: Callable[[], Iterable[Example]],
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
):
all_labels = set()
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="SimpleNER", obj=type(get_examples))
raise ValueError(err)
for example in get_examples():
all_labels.update(_get_labels(example))
for label in sorted(all_labels):
self.add_label(label)
labels = self.labels
n_actions = self.model.attrs["get_num_actions"](len(labels))
self.model.set_dim("nO", n_actions)
self.model.initialize()
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(), normalize=True, missing_value=None
)
return sgd
def init_multitask_objectives(self, *args, **kwargs):
pass
def score(self, examples, **kwargs):
validate_examples(examples, "SimpleNER.score")
return Scorer.score_spans(examples, "ents", **kwargs)
def _has_ner(example: Example) -> bool:
for ner_tag in example.get_aligned_ner():
if ner_tag != "-" and ner_tag is not None:
return True
else:
return False
def _get_labels(example: Example) -> Set[str]:
labels = set()
for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
if ner_tag != "O" and ner_tag != "-":
labels.add(ner_tag)
return labels

View File

@ -5,6 +5,7 @@ import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
from thinc.types import Floats2d from thinc.types import Floats2d
import warnings import warnings
from itertools import islice
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..morphology cimport Morphology from ..morphology cimport Morphology
@ -16,7 +17,7 @@ from ..attrs import POS, ID
from ..parts_of_speech import X from ..parts_of_speech import X
from ..errors import Errors, TempErrors, Warnings from ..errors import Errors, TempErrors, Warnings
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
from .. import util from .. import util
@ -258,10 +259,11 @@ class Tagger(Pipe):
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -271,32 +273,24 @@ class Tagger(Pipe):
DOCS: https://nightly.spacy.io/api/tagger#begin_training DOCS: https://nightly.spacy.io/api/tagger#begin_training
""" """
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="Tagger", obj=type(get_examples))
raise ValueError(err)
tags = set()
doc_sample = [] doc_sample = []
label_sample = []
tags = set()
for example in get_examples(): for example in get_examples():
for token in example.y: for token in example.y:
if token.tag_:
tags.add(token.tag_) tags.add(token.tag_)
if len(doc_sample) < 10:
doc_sample.append(example.x)
if not doc_sample:
doc_sample.append(Doc(self.vocab, words=["hello"]))
for tag in sorted(tags): for tag in sorted(tags):
self.add_label(tag) self.add_label(tag)
if len(self.labels) == 0: for example in islice(get_examples(), 10):
err = Errors.E1006.format(name="Tagger") doc_sample.append(example.x)
raise ValueError(err) gold_tags = example.get_aligned("TAG", as_string=True)
self.set_output(len(self.labels)) gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
if doc_sample: label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
label_sample = [ assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.ops.alloc2f(len(doc), len(self.labels)) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
for doc in doc_sample
]
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
else:
self.model.initialize()
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
@ -313,6 +307,7 @@ class Tagger(Pipe):
raise ValueError(Errors.E187) raise ValueError(Errors.E187)
if label in self.labels: if label in self.labels:
return 0 return 0
self._allow_extra_label()
self.cfg["labels"].append(label) self.cfg["labels"].append(label)
self.vocab.strings.add(label) self.vocab.strings.add(label)
return 1 return 1
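Taken together, the tagger changes fix the label set at initialization time: labels are gathered from the data (or added beforehand), the output dimension is set once, and later `add_label` calls hit `_allow_extra_label()`. A usage sketch matching the new tagger tests below:
from spacy.language import Language
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("N")
tagger.add_label("V")
nlp.begin_training()
assert tagger.model.get_dim("nO") == 2
# adding a label after initialization now raises, because the tagger model is not resizable:
# tagger.add_label("J")  # ValueError via _allow_extra_label()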

View File

@ -1,3 +1,4 @@
from itertools import islice
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
from thinc.types import Floats2d from thinc.types import Floats2d
@ -5,7 +6,7 @@ import numpy
from .pipe import Pipe from .pipe import Pipe
from ..language import Language from ..language import Language
from ..gold import Example, validate_examples from ..training import Example, validate_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from .. import util from .. import util
@ -128,11 +129,6 @@ class TextCategorizer(Pipe):
""" """
return tuple(self.cfg.setdefault("labels", [])) return tuple(self.cfg.setdefault("labels", []))
def require_labels(self) -> None:
"""Raise an error if the component's model has no labels defined."""
if not self.labels:
raise ValueError(Errors.E143.format(name=self.name))
@labels.setter @labels.setter
def labels(self, value: Iterable[str]) -> None: def labels(self, value: Iterable[str]) -> None:
self.cfg["labels"] = tuple(value) self.cfg["labels"] = tuple(value)
@ -311,17 +307,7 @@ class TextCategorizer(Pipe):
raise ValueError(Errors.E187) raise ValueError(Errors.E187)
if label in self.labels: if label in self.labels:
return 0 return 0
if self.model.has_dim("nO"): self._allow_extra_label()
# This functionality was available previously, but was broken.
# The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers
# - a huge problem.
raise ValueError(Errors.E116)
# smaller = self.model._layers[-1]
# larger = Linear(len(self.labels)+1, smaller.nI)
# copy_array(larger.W[:smaller.nO], smaller.W)
# copy_array(larger.b[:smaller.nO], smaller.b)
# self.model._layers[-1] = larger
self.labels = tuple(list(self.labels) + [label]) self.labels = tuple(list(self.labels) + [label])
return 1 return 1
@ -332,10 +318,11 @@ class TextCategorizer(Pipe):
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
) -> Optimizer: ) -> Optimizer:
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -345,22 +332,19 @@ class TextCategorizer(Pipe):
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
""" """
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
raise ValueError(err)
subbatch = [] # Select a subbatch of examples to initialize the model subbatch = [] # Select a subbatch of examples to initialize the model
for example in get_examples(): for example in islice(get_examples(), 10):
if len(subbatch) < 2: if len(subbatch) < 2:
subbatch.append(example) subbatch.append(example)
for cat in example.y.cats: for cat in example.y.cats:
self.add_label(cat) self.add_label(cat)
self.require_labels() doc_sample = [eg.reference for eg in subbatch]
docs = [eg.reference for eg in subbatch] label_sample, _ = self._examples_to_truth(subbatch)
if not docs: # need at least one doc self._require_labels()
docs = [Doc(self.vocab, words=["hello"])] assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
truths, _ = self._examples_to_truth(subbatch) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.set_output(len(self.labels)) self.model.initialize(X=doc_sample, Y=label_sample)
self.model.initialize(X=docs, Y=truths)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
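The textcat now derives its labels from `example.y.cats` in the first ten examples and initializes on real docs plus the truth matrix from `_examples_to_truth`, instead of a placeholder "hello" doc. Caller-side, that matches the updated overfitting test further down; `nlp` and `TRAIN_DATA` are assumed from that test:
textcat = nlp.add_pipe("textcat")
train_examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA]
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
assert textcat.model.get_dim("nO") == 2   # one output per category present in TRAIN_DATA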

View File

@ -1,8 +1,9 @@
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
from thinc.api import Model, set_dropout_rate, Optimizer, Config from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice
from .pipe import Pipe from .pipe import Pipe
from ..gold import Example, validate_examples from ..training import Example, validate_examples
from ..tokens import Doc from ..tokens import Doc
from ..vocab import Vocab from ..vocab import Vocab
from ..language import Language from ..language import Language
@ -209,10 +210,11 @@ class Tok2Vec(Pipe):
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
): ):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -222,8 +224,12 @@ class Tok2Vec(Pipe):
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
""" """
docs = [Doc(self.vocab, words=["hello"])] self._ensure_examples(get_examples)
self.model.initialize(X=docs) doc_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
assert doc_sample, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample)
def add_label(self, label): def add_label(self, label):
raise NotImplementedError raise NotImplementedError
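Tok2Vec follows the same pattern: instead of initializing on a synthetic single-word doc, it now takes up to ten real docs from `get_examples` and asserts that at least one was provided, so an empty example source fails fast. A sketch of the new call, assuming `tok2vec` and `get_examples` are already bound:
from itertools import islice
doc_sample = [eg.x for eg in islice(get_examples(), 10)]
assert doc_sample   # Errors.E923: no sample docs to initialize from
tok2vec.model.initialize(X=doc_sample)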

View File

@ -21,7 +21,7 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ..ml.parser_model cimport get_c_weights, get_c_sizes from ..ml.parser_model cimport get_c_weights, get_c_sizes
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..gold import validate_examples from ..training import validate_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from .. import util from .. import util
@ -244,7 +244,7 @@ cdef class Parser(Pipe):
int nr_class, int batch_size) nogil: int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil: with gil:
assert self.moves.n_moves > 0 assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess cdef int i, guess
cdef Transition action cdef Transition action
@ -378,7 +378,7 @@ cdef class Parser(Pipe):
cdef int i cdef int i
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.moves.n_moves > 0 assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float)) costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
@ -406,9 +406,7 @@ cdef class Parser(Pipe):
self.model.attrs["resize_output"](self.model, nO) self.model.attrs["resize_output"](self.model, nO)
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
raise ValueError(err)
self.cfg.update(kwargs) self.cfg.update(kwargs)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
@ -430,9 +428,6 @@ cdef class Parser(Pipe):
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
doc_sample = [] doc_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.predicted)
if pipeline is not None: if pipeline is not None:
for name, component in pipeline: for name, component in pipeline:
if component is self: if component is self:
@ -441,10 +436,11 @@ cdef class Parser(Pipe):
doc_sample = list(component.pipe(doc_sample, batch_size=8)) doc_sample = list(component.pipe(doc_sample, batch_size=8))
else: else:
doc_sample = [component(doc) for doc in doc_sample] doc_sample = [component(doc) for doc in doc_sample]
if doc_sample: if not doc_sample:
for example in islice(get_examples(), 10):
doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample) self.model.initialize(doc_sample)
else:
self.model.initialize()
if pipeline is not None: if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
return sgd return sgd
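For the transition parser, the hunk above only falls back to pulling sample docs from the examples when the pipeline pass left `doc_sample` empty, and an assertion replaces the old bare `self.model.initialize()` branch. The reconstructed new control flow, paraphrased as a sketch:
if not doc_sample:
    doc_sample = [eg.predicted for eg in islice(get_examples(), 10)]
assert len(doc_sample) > 0   # Errors.E923: nothing to initialize the parser model with
self.model.initialize(doc_sample)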

View File

@ -12,7 +12,7 @@ from .attrs import NAMES
if TYPE_CHECKING: if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports # This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401 from .language import Language # noqa: F401
from .gold import Example # noqa: F401 from .training import Example # noqa: F401
ItemT = TypeVar("ItemT") ItemT = TypeVar("ItemT")
@ -180,7 +180,7 @@ class ModelMetaSchema(BaseModel):
url: StrictStr = Field("", title="Model author URL") url: StrictStr = Field("", title="Model author URL")
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors") vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers")
speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
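The meta schema change above flattens `labels` from a nested mapping to one flat list of label strings per component. A sketch of a value that validates against the new field, written as a Python dict; the component names and labels here are hypothetical:
# component name -> flat list of label strings (values are illustrative)
labels = {
    "tagger": ["N", "V"],
    "ner": ["PERSON", "LOC"],
    "textcat": ["POSITIVE", "NEGATIVE"],
}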

View File

@ -1,7 +1,7 @@
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np import numpy as np
from .gold import Example from .training import Example
from .tokens import Token, Doc, Span from .tokens import Token, Doc, Span
from .errors import Errors from .errors import Errors
from .util import get_lang_class, SimpleFrozenList from .util import get_lang_class, SimpleFrozenList

View File

@ -1,5 +1,6 @@
from spacy.training import Example
from spacy.pipeline import EntityRecognizer from spacy.pipeline import EntityRecognizer
from spacy.tokens import Span from spacy.tokens import Span, Doc
from spacy import registry from spacy import registry
import pytest import pytest
@ -7,6 +8,12 @@ from ..util import get_doc
from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.pipeline.ner import DEFAULT_NER_MODEL
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)
def test_doc_add_entities_set_ents_iob(en_vocab): def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
@ -18,10 +25,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL} cfg = {"model": DEFAULT_NER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"] model = registry.make_from_config(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config) ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: []) ner.begin_training(lambda: [_ner_example(ner)])
ner(doc) ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
@ -31,6 +36,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab): def test_ents_reset(en_vocab):
"""Ensure that resetting doc.ents does not change anything"""
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
config = { config = {
@ -41,11 +47,11 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL} cfg = {"model": DEFAULT_NER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"] model = registry.make_from_config(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config) ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: []) ner.begin_training(lambda: [_ner_example(ner)])
ner(doc) ner(doc)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) orig_iobs = [t.ent_iob_ for t in doc]
doc.ents = list(doc.ents) doc.ents = list(doc.ents)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) assert [t.ent_iob_ for t in doc] == orig_iobs
def test_add_overlapping_entities(en_vocab): def test_add_overlapping_entities(en_vocab):

View File

@ -3,7 +3,7 @@ from thinc.api import Adam, fix_random_seed
from spacy import registry from spacy import registry
from spacy.attrs import NORM from spacy.attrs import NORM
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.gold import Example from spacy.training import Example
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline import DependencyParser, EntityRecognizer
from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.pipeline.ner import DEFAULT_NER_MODEL
@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser): def _train_parser(parser):
fix_random_seed(1) fix_random_seed(1)
parser.add_label("left") parser.add_label("left")
parser.begin_training(lambda: [], **parser.cfg) parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(5): for i in range(5):
@ -47,16 +47,25 @@ def _train_parser(parser):
return parser return parser
def _parser_example(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
return Example.from_dict(doc, gold)
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)
def test_add_label(parser): def test_add_label(parser):
parser = _train_parser(parser) parser = _train_parser(parser)
parser.add_label("right") parser.add_label("right")
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(100): for i in range(100):
losses = {} losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) parser.update([_parser_example(parser)], sgd=sgd, losses=losses)
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
example = Example.from_dict(doc, gold)
parser.update([example], sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc) doc = parser(doc)
assert doc[0].dep_ == "right" assert doc[0].dep_ == "right"
@ -75,7 +84,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C") ner1.add_label("C")
ner1.add_label("B") ner1.add_label("B")
ner1.add_label("A") ner1.add_label("A")
ner1.begin_training(lambda: []) ner1.begin_training(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config) ner2 = EntityRecognizer(Vocab(), model, **config)
# the second model needs to be resized before we can call from_bytes # the second model needs to be resized before we can call from_bytes

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy import registry from spacy import registry
from spacy.gold import Example from spacy.training import Example
from spacy.pipeline import DependencyParser from spacy.pipeline import DependencyParser
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline._parser_internals.nonproj import projectivize from spacy.pipeline._parser_internals.nonproj import projectivize

View File

@ -4,7 +4,7 @@ from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.lookups import Lookups from spacy.lookups import Lookups
from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.gold import Example from spacy.training import Example
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
import logging import logging

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy import registry from spacy import registry
from spacy.gold import Example from spacy.training import Example
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.pipeline._parser_internals.arc_eager import ArcEager from spacy.pipeline._parser_internals.arc_eager import ArcEager
from spacy.pipeline.transition_parser import Parser from spacy.pipeline.transition_parser import Parser

View File

@ -3,7 +3,7 @@ import pytest
from spacy.lang.en import English from spacy.lang.en import English
from ..util import get_doc, apply_transition_sequence, make_tempdir from ..util import get_doc, apply_transition_sequence, make_tempdir
from ... import util from ... import util
from ...gold import Example from ...training import Example
TRAIN_DATA = [ TRAIN_DATA = [
( (
@ -85,7 +85,7 @@ def test_parser_merge_pp(en_tokenizer):
pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc( doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos, tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos
) )
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
for np in doc.noun_chunks: for np in doc.noun_chunks:

View File

@ -3,7 +3,7 @@ from thinc.api import Adam
from spacy.attrs import NORM from spacy.attrs import NORM
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy import registry from spacy import registry
from spacy.gold import Example from spacy.training import Example
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline import DependencyParser from spacy.pipeline import DependencyParser
@ -14,6 +14,12 @@ def vocab():
return Vocab(lex_attr_getters={NORM: lambda s: s}) return Vocab(lex_attr_getters={NORM: lambda s: s})
def _parser_example(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
return Example.from_dict(doc, gold)
@pytest.fixture @pytest.fixture
def parser(vocab): def parser(vocab):
config = { config = {
@ -28,7 +34,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32 parser.cfg["hidden_width"] = 32
# parser.add_label('right') # parser.add_label('right')
parser.add_label("left") parser.add_label("left")
parser.begin_training(lambda: [], **parser.cfg) parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(10): for i in range(10):

View File

@ -1,6 +1,6 @@
import pytest import pytest
import numpy import numpy
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import AttributeRuler from spacy.pipeline import AttributeRuler
from spacy import util, registry from spacy import util, registry

View File

@ -4,7 +4,7 @@ import pytest
from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy.kb import KnowledgeBase, get_candidates, Candidate
from spacy import util, registry from spacy import util, registry
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
from spacy.tokens import Span from spacy.tokens import Span
@ -281,11 +281,12 @@ def test_append_invalid_alias(nlp):
def test_preserving_links_asdoc(nlp): def test_preserving_links_asdoc(nlp):
"""Test that Span.as_doc preserves the existing entity links""" """Test that Span.as_doc preserves the existing entity links"""
vector_length = 1
@registry.misc.register("myLocationsKB.v1") @registry.misc.register("myLocationsKB.v1")
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
def create_kb(vocab): def create_kb(vocab):
mykb = KnowledgeBase(vocab, entity_vector_length=1) mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
# adding entities # adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
@ -305,10 +306,9 @@ def test_preserving_links_asdoc(nlp):
ruler = nlp.add_pipe("entity_ruler") ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns) ruler.add_patterns(patterns)
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True) entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
el_pipe.begin_training(lambda: []) nlp.begin_training()
el_pipe.incl_context = False assert entity_linker.model.get_dim("nO") == vector_length
el_pipe.incl_prior = True
# test whether the entity links are preserved by the `as_doc()` function # test whether the entity links are preserved by the `as_doc()` function
text = "She lives in Boston. He lives in Denver." text = "She lives in Boston. He lives in Denver."
@ -373,6 +373,7 @@ def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English() nlp = English()
nlp.add_pipe("sentencizer") nlp.add_pipe("sentencizer")
vector_length = 3
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
patterns = [ patterns = [
@ -393,7 +394,7 @@ def test_overfitting_IO():
# create artificial KB - assign same prior weight to the two russ cochran's # create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer # Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher # Q7381115 (Russ Cochran): publisher
mykb = KnowledgeBase(vocab, entity_vector_length=3) mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias( mykb.add_alias(
@ -406,14 +407,17 @@ def test_overfitting_IO():
return create_kb return create_kb
# Create the Entity Linker component and add it to the pipeline # Create the Entity Linker component and add it to the pipeline
nlp.add_pipe( entity_linker = nlp.add_pipe(
"entity_linker", "entity_linker",
config={"kb_loader": {"@misc": "myOverfittingKB.v1"}}, config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
last=True, last=True,
) )
# train the NEL pipe # train the NEL pipe
optimizer = nlp.begin_training() optimizer = nlp.begin_training(get_examples=lambda: train_examples)
assert entity_linker.model.get_dim("nO") == vector_length
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
for i in range(50): for i in range(50):
losses = {} losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
@ -25,27 +25,61 @@ TRAIN_DATA = [
}, },
), ),
# test combinations of morph+POS # test combinations of morph+POS
("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},), ("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]}),
] ]
def test_no_label():
nlp = Language()
nlp.add_pipe("morphologizer")
with pytest.raises(ValueError):
nlp.begin_training()
def test_implicit_label():
nlp = Language()
nlp.add_pipe("morphologizer")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
def test_no_resize():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
nlp.begin_training()
# this throws an error because the morphologizer can't be resized after initialization
with pytest.raises(ValueError):
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
def test_begin_training_examples():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
nlp = English() nlp = English()
morphologizer = nlp.add_pipe("morphologizer") nlp.add_pipe("morphologizer")
train_examples = [] train_examples = []
for inst in TRAIN_DATA: for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]): optimizer = nlp.begin_training(get_examples=lambda: train_examples)
if morph and pos:
morphologizer.add_label(
morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos
)
elif pos:
morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
elif morph:
morphologizer.add_label(morph)
optimizer = nlp.begin_training()
for i in range(50): for i in range(50):
losses = {} losses = {}
@ -55,18 +89,8 @@ def test_overfitting_IO():
# test the trained model # test the trained model
test_text = "I like blue ham" test_text = "I like blue ham"
doc = nlp(test_text) doc = nlp(test_text)
gold_morphs = [ gold_morphs = ["Feat=N", "Feat=V", "", ""]
"Feat=N", gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
"Feat=V",
"",
"",
]
gold_pos_tags = [
"NOUN",
"VERB",
"ADJ",
"",
]
assert [t.morph_ for t in doc] == gold_morphs assert [t.morph_ for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
@ -30,6 +30,20 @@ TRAIN_DATA = [
), ),
] ]
def test_begin_training_examples():
nlp = Language()
senter = nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the senter - ensuring the ML models work correctly # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly

View File

@ -1,45 +0,0 @@
from spacy.lang.en import English
from spacy.gold import Example
from spacy import util
from ..util import make_tempdir
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
def test_overfitting_IO():
# Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly
nlp = English()
ner = nlp.add_pipe("simple_ner")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.begin_training()
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.0001
# test the trained model
test_text = "I like London."
doc = nlp(test_text)
ents = doc.ents
assert len(ents) == 1
assert ents[0].text == "London"
assert ents[0].label_ == "LOC"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
ents2 = doc2.ents
assert len(ents2) == 1
assert ents2[0].text == "London"
assert ents2[0].label_ == "LOC"

View File

@ -1,6 +1,6 @@
import pytest import pytest
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
@ -34,6 +34,56 @@ TRAIN_DATA = [
] ]
def test_no_label():
nlp = Language()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
def test_no_resize():
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("N")
tagger.add_label("V")
assert tagger.labels == ("N", "V")
nlp.begin_training()
assert tagger.model.get_dim("nO") == 2
# this throws an error because the tagger can't be resized after initialization
with pytest.raises(ValueError):
tagger.add_label("J")
def test_implicit_label():
nlp = Language()
nlp.add_pipe("tagger")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
def test_begin_training_examples():
nlp = Language()
tagger = nlp.add_pipe("tagger")
train_examples = []
for tag in TAGS:
tagger.add_label(tag)
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: train_examples[0])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=lambda: [])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
nlp = English() nlp = English()
@ -41,9 +91,8 @@ def test_overfitting_IO():
train_examples = [] train_examples = []
for t in TRAIN_DATA: for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
for tag in TAGS: optimizer = nlp.begin_training(get_examples=lambda: train_examples)
tagger.add_label(tag) assert tagger.model.get_dim("nO") == len(TAGS)
optimizer = nlp.begin_training()
for i in range(50): for i in range(50):
losses = {} losses = {}

View File

@ -10,7 +10,7 @@ from spacy.tokens import Doc
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from ..util import make_tempdir from ..util import make_tempdir
from ...gold import Example from ...training import Example
TRAIN_DATA = [ TRAIN_DATA = [
@ -80,6 +80,51 @@ def test_label_types():
textcat.add_label(9) textcat.add_label(9)
def test_no_label():
nlp = Language()
nlp.add_pipe("textcat")
with pytest.raises(ValueError):
nlp.begin_training()
def test_implicit_label():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
def test_no_resize():
nlp = Language()
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.begin_training()
assert textcat.model.get_dim("nO") == 2
# this throws an error because the textcat can't be resized after initialization
with pytest.raises(ValueError):
textcat.add_label("NEUTRAL")
def test_begin_training_examples():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items():
textcat.add_label(label)
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
fix_random_seed(0) fix_random_seed(0)
@ -89,9 +134,8 @@ def test_overfitting_IO():
train_examples = [] train_examples = []
for text, annotations in TRAIN_DATA: for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items(): optimizer = nlp.begin_training(get_examples=lambda: train_examples)
textcat.add_label(label) assert textcat.model.get_dim("nO") == 2
optimizer = nlp.begin_training()
for i in range(50): for i in range(50):
losses = {} losses = {}

View File

@ -1,7 +1,7 @@
import pytest import pytest
import random import random
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.vocab import Vocab from spacy.vocab import Vocab

View File

@ -3,7 +3,7 @@ import gc
import numpy import numpy
import copy import copy
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.lex_attrs import is_stop from spacy.lang.lex_attrs import is_stop

View File

@ -3,7 +3,7 @@ import numpy
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.displacy import render from spacy.displacy import render
from spacy.gold import iob_to_biluo from spacy.training import iob_to_biluo
from spacy.lang.it import Italian from spacy.lang.it import Italian
from spacy.lang.en import English from spacy.lang.en import English

View File

@ -1,6 +1,6 @@
import pytest import pytest
from spacy import displacy from spacy import displacy
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.ja import Japanese from spacy.lang.ja import Japanese
from spacy.lang.xx import MultiLanguage from spacy.lang.xx import MultiLanguage
@ -20,7 +20,7 @@ def test_issue2564():
nlp = Language() nlp = Language()
tagger = nlp.add_pipe("tagger") tagger = nlp.add_pipe("tagger")
tagger.add_label("A") tagger.add_label("A")
tagger.begin_training(lambda: []) nlp.begin_training()
doc = nlp("hello world") doc = nlp("hello world")
assert doc.is_tagged assert doc.is_tagged
docs = nlp.pipe(["hello", "world"]) docs = nlp.pipe(["hello", "world"])

View File

@ -9,7 +9,7 @@ from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError from spacy.errors import MatchPatternError
from spacy.util import minibatch from spacy.util import minibatch
from spacy.gold import Example from spacy.training import Example
from spacy.lang.hi import Hindi from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish from spacy.lang.es import Spanish
from spacy.lang.en import English from spacy.lang.en import English
@ -251,6 +251,12 @@ def test_issue3803():
assert [t.like_num for t in doc] == [True, True, True, True, True, True] assert [t.like_num for t in doc] == [True, True, True, True, True, True]
def _parser_example(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
return Example.from_dict(doc, gold)
def test_issue3830_no_subtok(): def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens""" """Test that the parser doesn't have subtok label if not learn_tokens"""
config = { config = {
@ -264,7 +270,7 @@ def test_issue3830_no_subtok():
parser = DependencyParser(Vocab(), model, **config) parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
parser.begin_training(lambda: []) parser.begin_training(lambda: [_parser_example(parser)])
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
@ -281,7 +287,7 @@ def test_issue3830_with_subtok():
parser = DependencyParser(Vocab(), model, **config) parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
parser.begin_training(lambda: []) parser.begin_training(lambda: [_parser_example(parser)])
assert "subtok" in parser.labels assert "subtok" in parser.labels

View File

@ -2,8 +2,8 @@ import pytest
from spacy.pipeline import Pipe from spacy.pipeline import Pipe
from spacy.matcher import PhraseMatcher, Matcher from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus from spacy.training import Example, Corpus
from spacy.gold.converters import json2docs from spacy.training.converters import json2docs
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.lang.en import English from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model from spacy.util import minibatch, ensure_path, load_model

View File

@ -1,9 +1,7 @@
import pytest import pytest
from mock import Mock
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example from spacy.training import Example
from spacy.gold.converters.conllu2docs import conllu2docs from spacy.training.converters.conllu2docs import conllu2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.kb import KnowledgeBase from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab from spacy.vocab import Vocab
@ -12,7 +10,7 @@ from spacy.util import ensure_path, load_model_from_path
import numpy import numpy
import pickle import pickle
from ..util import get_doc, make_tempdir from ..util import make_tempdir
def test_issue4528(en_vocab): def test_issue4528(en_vocab):

View File

@ -64,7 +64,7 @@ def tagger():
# 1. no model leads to error in serialization, # 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization # 2. the affected line is the one for model serialization
tagger.add_label("A") tagger.add_label("A")
tagger.begin_training(lambda: [], pipeline=nlp.pipeline) nlp.begin_training()
return tagger return tagger
@ -85,7 +85,7 @@ def entity_linker():
# need to add model for two reasons: # need to add model for two reasons:
# 1. no model leads to error in serialization, # 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization # 2. the affected line is the one for model serialization
entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) nlp.begin_training()
return entity_linker return entity_linker

View File

@ -1,14 +1,15 @@
import pytest import pytest
from click import NoSuchOption from click import NoSuchOption
from spacy.gold import docs_to_json, biluo_tags_from_offsets from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.pretrain import make_docs from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list
from thinc.config import ConfigValidationError from thinc.config import ConfigValidationError
import srsly import srsly
@ -372,17 +373,13 @@ def test_parse_config_overrides(args, expected):
assert parse_config_overrides(args) == expected assert parse_config_overrides(args) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize("args", [["--foo"], ["--x.foo", "bar", "--baz"]])
"args", [["--foo"], ["--x.foo", "bar", "--baz"]],
)
def test_parse_config_overrides_invalid(args): def test_parse_config_overrides_invalid(args):
with pytest.raises(NoSuchOption): with pytest.raises(NoSuchOption):
parse_config_overrides(args) parse_config_overrides(args)
@pytest.mark.parametrize( @pytest.mark.parametrize("args", [["--x.foo", "bar", "baz"], ["x.foo"]])
"args", [["--x.foo", "bar", "baz"], ["x.foo"]],
)
def test_parse_config_overrides_invalid_2(args): def test_parse_config_overrides_invalid_2(args):
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
parse_config_overrides(args) parse_config_overrides(args)
@ -401,3 +398,44 @@ def test_init_config(lang, pipeline, optimize):
def test_model_recommendations(): def test_model_recommendations():
for lang, data in RECOMMENDATIONS.items(): for lang, data in RECOMMENDATIONS.items():
assert RecommendationSchema(**data) assert RecommendationSchema(**data)
@pytest.mark.parametrize(
"value",
[
# fmt: off
"parser,textcat,tagger",
" parser, textcat ,tagger ",
'parser,textcat,tagger',
' parser, textcat ,tagger ',
' "parser"," textcat " ,"tagger "',
" 'parser',' textcat ' ,'tagger '",
'[parser,textcat,tagger]',
'["parser","textcat","tagger"]',
'[" parser" ,"textcat ", " tagger " ]',
"[parser,textcat,tagger]",
"[ parser, textcat , tagger]",
"['parser','textcat','tagger']",
"[' parser' , 'textcat', ' tagger ' ]",
# fmt: on
],
)
def test_string_to_list(value):
assert string_to_list(value, intify=False) == ["parser", "textcat", "tagger"]
@pytest.mark.parametrize(
"value",
[
# fmt: off
"1,2,3",
'[1,2,3]',
'["1","2","3"]',
'[" 1" ,"2 ", " 3 " ]',
"[' 1' , '2', ' 3 ' ]",
# fmt: on
],
)
def test_string_to_list_intify(value):
assert string_to_list(value, intify=False) == ["1", "2", "3"]
assert string_to_list(value, intify=True) == [1, 2, 3]

View File

@ -3,7 +3,7 @@ import pytest
from spacy.language import Language from spacy.language import Language
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.util import registry from spacy.util import registry

View File

@ -1,5 +1,5 @@
import pytest import pytest
from spacy.gold.example import Example from spacy.training.example import Example
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab

View File

@ -1,8 +1,8 @@
from numpy.testing import assert_almost_equal, assert_array_almost_equal from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest import pytest
from pytest import approx from pytest import approx
from spacy.gold import Example from spacy.training import Example
from spacy.gold.iob_utils import biluo_tags_from_offsets from spacy.training.iob_utils import biluo_tags_from_offsets
from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc from .util import get_doc

View File

@ -6,7 +6,7 @@ from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.gold import Example from spacy.training import Example
from spacy import util from spacy import util
from spacy.lang.en import English from spacy.lang.en import English
from .util import get_batch from .util import get_batch
@ -89,6 +89,7 @@ def test_init_tok2vec():
tok2vec = nlp.add_pipe("tok2vec") tok2vec = nlp.add_pipe("tok2vec")
assert tok2vec.listeners == [] assert tok2vec.listeners == []
nlp.begin_training() nlp.begin_training()
assert tok2vec.model.get_dim("nO")
cfg_string = """ cfg_string = """

View File

@ -1,9 +1,10 @@
import numpy import numpy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
from spacy.gold import spans_from_biluo_tags, iob_to_biluo from spacy.training import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json from spacy.training import Corpus, docs_to_json
from spacy.gold.example import Example from spacy.training.example import Example
from spacy.gold.converters import json2docs from spacy.training.converters import json2docs
from spacy.training.augment import make_orth_variants_example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tokens import Doc, DocBin from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch from spacy.util import get_words_and_spaces, minibatch
@ -12,7 +13,6 @@ import pytest
import srsly import srsly
from .util import make_tempdir from .util import make_tempdir
from ..gold.augment import make_orth_variants_example
@pytest.fixture @pytest.fixture

View File

@ -5,7 +5,7 @@ from .util import get_random_doc
from spacy import util from spacy import util
from spacy.util import dot_to_object, SimpleFrozenList from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer from thinc.api import Config, Optimizer
from spacy.gold.batchers import minibatch_by_words from spacy.training.batchers import minibatch_by_words
from ..lang.en import English from ..lang.en import English
from ..lang.nl import Dutch from ..lang.nl import Dutch
from ..language import DEFAULT_CONFIG_PATH from ..language import DEFAULT_CONFIG_PATH

View File

@ -24,7 +24,7 @@ from .util import registry
from .attrs import intify_attrs from .attrs import intify_attrs
from .symbols import ORTH from .symbols import ORTH
from .scorer import Scorer from .scorer import Scorer
from .gold import validate_examples from .training import validate_examples
cdef class Tokenizer: cdef class Tokenizer:

View File

@ -576,7 +576,7 @@ cdef class Doc:
entity_type = 0 entity_type = 0
kb_id = 0 kb_id = 0
# Set ent_iob to Missing (0) bij default unless this token was nered before # Set ent_iob to Missing (0) by default unless this token was nered before
ent_iob = 0 ent_iob = 0
if self.c[i].ent_iob != 0: if self.c[i].ent_iob != 0:
ent_iob = 2 ent_iob = 2

View File

@ -1,7 +1,7 @@
from wasabi import Printer from wasabi import Printer
from .. import tags_to_entities from .. import tags_to_entities
from ...gold import iob_to_biluo from ...training import iob_to_biluo
from ...lang.xx import MultiLanguage from ...lang.xx import MultiLanguage
from ...tokens import Doc, Span from ...tokens import Doc, Span
from ...util import load_model from ...util import load_model

View File

@ -1,7 +1,7 @@
import re import re
from .conll_ner2docs import n_sents_info from .conll_ner2docs import n_sents_info
from ...gold import iob_to_biluo, spans_from_biluo_tags from ...training import iob_to_biluo, spans_from_biluo_tags
from ...tokens import Doc, Token, Span from ...tokens import Doc, Token, Span
from ...vocab import Vocab from ...vocab import Vocab
from wasabi import Printer from wasabi import Printer

View File

@ -1,7 +1,7 @@
from wasabi import Printer from wasabi import Printer
from .conll_ner2docs import n_sents_info from .conll_ner2docs import n_sents_info
from ...gold import iob_to_biluo, tags_to_entities from ...training import iob_to_biluo, tags_to_entities
from ...tokens import Doc, Span from ...tokens import Doc, Span
from ...util import minibatch from ...util import minibatch

View File

@ -195,13 +195,15 @@ def tags_to_entities(tags):
continue continue
elif tag.startswith("I"): elif tag.startswith("I"):
if start is None: if start is None:
raise ValueError(Errors.E067.format(tags=tags[: i + 1])) raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
continue continue
if tag.startswith("U"): if tag.startswith("U"):
entities.append((tag[2:], i, i)) entities.append((tag[2:], i, i))
elif tag.startswith("B"): elif tag.startswith("B"):
start = i start = i
elif tag.startswith("L"): elif tag.startswith("L"):
if start is None:
raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1]))
entities.append((tag[2:], start, i)) entities.append((tag[2:], start, i))
start = None start = None
else: else:
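For reference, a minimal sketch of how the function above is typically called — assuming it takes a flat list of BILUO tag strings and returns `(label, start, end)` token triples, as the appended branches suggest:

```python
# Minimal sketch: tags_to_entities turns BILUO tag strings into
# (label, start_token, end_token) triples; an "I" or "L" tag without a
# preceding "B" now raises E067 instead of failing silently.
from spacy.training import tags_to_entities

tags = ["O", "B-LOC", "L-LOC", "U-PERSON"]
print(tags_to_entities(tags))  # [('LOC', 1, 2), ('PERSON', 3, 3)]
```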

View File

@ -93,6 +93,7 @@ class registry(thinc.registry):
# environment. spaCy models packaged with `spacy package` will "advertise" # environment. spaCy models packaged with `spacy package` will "advertise"
# themselves via entry points. # themselves via entry points.
models = catalogue.create("spacy", "models", entry_points=True) models = catalogue.create("spacy", "models", entry_points=True)
cli = catalogue.create("spacy", "cli", entry_points=True)
class SimpleFrozenDict(dict): class SimpleFrozenDict(dict):
@ -647,7 +648,7 @@ def join_command(command: List[str]) -> str:
return " ".join(shlex.quote(cmd) for cmd in command) return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None: def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
"""Run a command on the command line as a subprocess. If the subprocess """Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed. returns a non-zero exit code, a system exit is performed.
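A hedged sketch of calling the function with its relaxed return annotation; whether `capture=True` returns the completed process is an assumption based on the annotation change above:

```python
# Hedged sketch: run_command executes a command as a subprocess and performs
# a system exit if it returns a non-zero exit code. The capture=True return
# value is an assumption, not confirmed by this diff.
from spacy.util import run_command

run_command(["python", "--version"])
```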

View File

@ -290,10 +290,10 @@ always be the **last element** in the row.
> ``` > ```
| Name | Description | | Name | Description |
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | | `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
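A hedged sketch of what a custom `set_extra_annotations` callback might look like, matching the signature in the table above; the `trf_batch` extension name is purely illustrative:

```python
# Illustrative callback matching Callable[[List[Doc], FullTransformerBatch], None].
# The "trf_batch" extension name is made up for this sketch.
from typing import List
from spacy.tokens import Doc

if not Doc.has_extension("trf_batch"):
    Doc.set_extension("trf_batch", default=None)

def set_extra_annotations(docs: List[Doc], trf_data) -> None:
    # stash the whole transformer output batch on each Doc for later inspection
    for doc in docs:
        doc._.trf_batch = trf_data
```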
### List {#list} ### List {#list}
@ -609,7 +609,6 @@ In addition to the native markdown elements, you can use the components
├── docs # the actual markdown content ├── docs # the actual markdown content
├── meta # JSON-formatted site metadata ├── meta # JSON-formatted site metadata
| ├── languages.json # supported languages and statistical models | ├── languages.json # supported languages and statistical models
| ├── logos.json # logos and links for landing page
| ├── sidebars.json # sidebar navigations for different sections | ├── sidebars.json # sidebar navigations for different sections
| ├── site.json # general site metadata | ├── site.json # general site metadata
| └── universe.json # data for the spaCy universe section | └── universe.json # data for the spaCy universe section

View File

@ -181,10 +181,10 @@ characters would be `"jumpping"`: 4 from the start, 4 from the end. This ensures
that the final character is always in the last position, instead of being in an that the final character is always in the last position, instead of being in an
arbitrary position depending on the word length. arbitrary position depending on the word length.
The characters are embedded in an embedding table with 256 rows, and the vectors The characters are embedded in an embedding table with a given number of rows,
concatenated. A hash-embedded vector of the `NORM` of the word is also and the vectors concatenated. A hash-embedded vector of the `NORM` of the word
concatenated on, and the result is then passed through a feed-forward network to is also concatenated on, and the result is then passed through a feed-forward
construct a single vector to represent the information. network to construct a single vector to represent the information.
| Name | Description | | Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
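To make the fixed-width character window concrete, a tiny sketch of the truncation described above; the helper name and the 4+4 split are illustrative, and the real layer presumably handles words shorter than the window by padding:

```python
# Illustrative only: take a fixed number of characters from the start and end
# of each word, so the final character always sits in the last position.
def char_window(word: str, n_start: int = 4, n_end: int = 4) -> str:
    return word[:n_start] + word[-n_end:]

assert char_window("jumping") == "jumpping"  # 4 from the start, 4 from the end
```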
@ -456,62 +456,6 @@ consists of either two or three subnetworks:
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.BILUOTagger.v1 "
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid BILUO tag
sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled
spans into tags assigned to each token. The first token of a span is given the
tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens
within the span are given the tag `I-LABEL`. Single-token spans are given the
tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme
generally results in better linear separation between classes, especially for
non-CRF models, because there are more distinct classes for the different
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
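For reference, the encoding described above can be reproduced with `biluo_tags_from_offsets`, which is imported from `spacy.training` elsewhere in this diff; the sentence and character offsets below are just an illustration:

```python
from spacy.lang.en import English
from spacy.training import biluo_tags_from_offsets

nlp = English()
doc = nlp("I like New York City")
# one labelled span covering "New York City" (character offsets 7-20)
tags = biluo_tags_from_offsets(doc, [(7, 20, "LOC")])
print(tags)  # ['O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']
```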
### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.IOBTagger.v1 "
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid IOB tag
sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled
spans into tags assigned to each token. The first token of a span is given the
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
are assigned the tag O.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
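The IOB scheme above maps onto BILUO via `iob_to_biluo`, also imported from `spacy.training` elsewhere in this diff; a minimal illustration:

```python
from spacy.training import iob_to_biluo

iob = ["O", "B-LOC", "I-LOC", "I-LOC", "O"]
print(iob_to_biluo(iob))  # ['O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']
```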
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
### spacy.Tagger.v1 {#Tagger} ### spacy.Tagger.v1 {#Tagger}

View File

@ -38,7 +38,7 @@ how the component should be configured. You can override its settings via the
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py %%GITHUB_SPACY/spacy/pipeline/attributeruler.py
``` ```
## AttributeRuler.\_\_init\_\_ {#init tag="method"} ## AttributeRuler.\_\_init\_\_ {#init tag="method"}

Some files were not shown because too many files have changed in this diff