Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2020-09-30 16:52:42 +02:00
commit c379a4274a
22 changed files with 356 additions and 173 deletions


@ -1,7 +1,7 @@
SHELL := /bin/bash
ifndef SPACY_EXTRAS
override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
endif
ifndef PYVER


@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a41,<8.0.0a50",
"thinc>=8.0.0a42,<8.0.0a50",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"pathy"


@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a41,<8.0.0a50
thinc>=8.0.0a42,<8.0.0a50
blis>=0.4.0,<0.5.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0


@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a41,<8.0.0a50
thinc>=8.0.0a42,<8.0.0a50
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a41,<8.0.0a50
thinc>=8.0.0a42,<8.0.0a50
blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0
@ -65,7 +65,7 @@ console_scripts =
[options.extras_require]
lookups =
spacy_lookups_data==0.4.0.dev0
spacy_lookups_data==1.0.0rc0
cuda =
cupy>=5.0.0b4,<9.0.0
cuda80 =


@ -16,6 +16,7 @@ import os
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import ENV_VARS
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
@ -65,7 +65,7 @@ def setup_cli() -> None:
def parse_config_overrides(
args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
) -> Dict[str, Any]:
"""Generate a dictionary of config overrides based on the extra arguments
provided on the CLI, e.g. --training.batch_size to override
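
For illustration, a minimal sketch of the behavior this function implements (the override values below are made up; the parsing behavior matches the CLI tests further down in this commit):

```python
from spacy.cli._util import parse_config_overrides

# Dotted section.key arguments become dictionary keys; values are parsed
# from their string form, so numbers and booleans come back typed.
overrides = parse_config_overrides(
    ["--training.batch_size", "128", "--paths.train", "corpus/train.spacy"]
)
assert overrides == {"training.batch_size": 128, "paths.train": "corpus/train.spacy"}
```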


@ -27,7 +27,7 @@ def init_vectors_cli(
you can use in the [initialize.vocab] block of your config to initialize
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
@ -55,14 +55,14 @@ def init_pipeline_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
nlp = init_nlp(config, use_gpu=use_gpu)
nlp.to_disk(output_path)
msg.good(f"Saved initialized pipeline to {output_path}")
@ -81,9 +81,12 @@ def init_labels_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""Generate a JSON file for labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir()
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
@ -93,7 +96,8 @@ def init_labels_cli(
nlp = init_nlp(config, use_gpu=use_gpu)
for name, component in nlp.pipeline:
if getattr(component, "label_data", None) is not None:
srsly.write_json(output_path / f"{name}.json", component.label_data)
msg.good(f"Saving {name} labels to {output_path}/{name}.json")
output_file = output_path / f"{name}.json"
srsly.write_json(output_file, component.label_data)
msg.good(f"Saving {name} labels to {output_file}")
else:
msg.info(f"No labels found for {name}")


@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = ""
dev = ""
train = null
dev = null
[system]
{% if use_transformer -%}


@ -40,7 +40,7 @@ def train_cli(
DOCS: https://nightly.spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
verify_cli_args(config_path, output_path)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
@ -50,6 +50,7 @@ def train_cli(
msg.divider("Initializing pipeline")
with show_validation_error(config_path, hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline")
msg.divider("Training pipeline")
train(nlp, output_path, use_gpu=use_gpu, silent=False)


@ -1,6 +1,6 @@
[paths]
train = ""
dev = ""
train = null
dev = null
vectors = null
vocab_data = null
init_tok2vec = null


@ -477,6 +477,8 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?")


@ -36,7 +36,7 @@ cdef class Pipe:
@property
def labels(self) -> Optional[Tuple[str]]:
return []
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate
@ -207,7 +207,7 @@ cdef class Pipe:
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
pass
def _ensure_examples(self, get_examples):
if get_examples is None or not hasattr(get_examples, "__call__"):


@ -14,8 +14,8 @@ from ..util import make_tempdir
nlp_config_string = """
[paths]
train = ""
dev = ""
train = null
dev = null
[corpora]
@ -309,7 +309,7 @@ def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}"
interpolated = config.interpolate()
assert interpolated["corpora"]["train"]["path"] == ""
assert interpolated["corpora"]["train"]["path"] is None
nlp = English.from_config(config)
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config
@ -317,10 +317,10 @@ def test_config_interpolation():
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate()
assert interpolated2["corpora"]["train"]["path"] == ""
assert interpolated2["corpora"]["train"]["path"] is None
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
nlp2 = English.from_config(interpolated)
assert nlp2.config["corpora"]["train"]["path"] == ""
assert nlp2.config["corpora"]["train"]["path"] is None
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342


@ -3,10 +3,11 @@ from click import NoSuchOption
from spacy.training import docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.util import ENV_VARS
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from spacy.cli._util import string_to_list
from thinc.api import ConfigValidationError
import srsly
import os
@ -342,21 +343,22 @@ def test_parse_config_overrides_invalid_2(args):
def test_parse_cli_overrides():
os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
overrides = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
os.environ[ENV_VARS.CONFIG_OVERRIDES] = overrides
result = parse_config_overrides([])
assert len(result) == 4
assert result["x.foo"] == "bar"
assert result["x.bar"] == 12
assert result["x.baz"] is False
assert result["y.foo"] == "hello"
os.environ[OVERRIDES_ENV_VAR] = "--x"
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--x"
assert parse_config_overrides([], env_var=None) == {}
with pytest.raises(SystemExit):
parse_config_overrides([])
os.environ[OVERRIDES_ENV_VAR] = "hello world"
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "hello world"
with pytest.raises(SystemExit):
parse_config_overrides([])
del os.environ[OVERRIDES_ENV_VAR]
del os.environ[ENV_VARS.CONFIG_OVERRIDES]
@pytest.mark.parametrize("lang", ["en", "nl"])


@ -7,7 +7,7 @@ import srsly
from .. import util
from .augment import dont_augment
from .example import Example
from ..errors import Warnings
from ..errors import Warnings, Errors
from ..tokens import DocBin, Doc
from ..vocab import Vocab
@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"
@util.registry.readers("spacy.Corpus.v1")
def create_docbin_reader(
path: Path,
path: Optional[Path],
gold_preproc: bool,
max_length: int = 0,
limit: int = 0,
augmenter: Optional[Callable] = None,
) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
util.logger.debug(f"Loading corpus from path: {path}")
return Corpus(
path,


@ -67,10 +67,14 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
# fmt: on
logging.basicConfig()
logging.basicConfig(format="%(message)s")
logger = logging.getLogger("spacy")
class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True)
architectures = catalogue.create("spacy", "architectures", entry_points=True)
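
A small sketch of how the new `ENV_VARS.CONFIG_OVERRIDES` constant added above ties into the override parsing (the override string here is illustrative; see the test changes below):

```python
import os

from spacy.cli._util import parse_config_overrides
from spacy.util import ENV_VARS

# Overrides set via the environment variable behave like extra CLI arguments.
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--training.max_epochs 3 --system.seed 0"
assert parse_config_overrides([]) == {"training.max_epochs": 3, "system.seed": 0}
del os.environ[ENV_VARS.CONFIG_OVERRIDES]
```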


@ -32,14 +32,16 @@ streaming.
> gold_preproc = false
> max_length = 0
> limit = 0
> augmenter = null
> ```
| Name | Description |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| Name | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/training/corpus.py
@ -74,7 +76,7 @@ train/test skew.
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
## Corpus.\_\_call\_\_ {#call tag="method"}


@ -191,16 +191,16 @@ browser. Will run a simple web server.
> displacy.serve([doc1, doc2], style="dep")
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| Name | Description |
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
### displacy.render {#displacy.render tag="method" new="2"}
@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
| **RETURNS** | The rendered HTML markup. ~~str~~ |
@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
| Name | Description |
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
@ -498,12 +498,13 @@ the [`Corpus`](/api/corpus) class.
> limit = 0
> ```
| Name | Description |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| Name | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
### JsonlReader {#jsonlreader}
@ -935,7 +936,7 @@ Compile a sequence of prefix rules into a regex object.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
@ -952,7 +953,7 @@ Compile a sequence of suffix rules into a regex object.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
@ -969,7 +970,7 @@ Compile a sequence of infix rules into a regex object.
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.minibatch {#util.minibatch tag="function" new="2"}

File diff suppressed because one or more lines are too long (new image, 83 KiB)


@ -32,7 +32,7 @@ the [config](/usage/training#config):
```ini
[nlp]
pipeline = ["tagger", "parser", "ner"]
pipeline = ["tok2vec", "tagger", "parser", "ner"]
```
import Accordion from 'components/accordion.js'


@ -167,8 +167,8 @@ the binary data:
```python
### spacy.load under the hood
lang = "en"
pipeline = ["tagger", "parser", "ner"]
data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0"
pipeline = ["tok2vec", "tagger", "parser", "ner"]
data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English
nlp = cls() # 2. Initialize it
@ -197,9 +197,9 @@ list of human-readable component names.
```python
print(nlp.pipeline)
# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
print(nlp.pipe_names)
# ['tagger', 'parser', 'ner']
# ['tok2vec', 'tagger', 'parser', 'ner']
```
### Built-in pipeline components {#built-in}
@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
customize how the model is updated from examples, how it's initialized, how the
loss is calculated and to add evaluation scores to the training output.
| Name | Description |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
| Name | Description |
| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
<Infobox title="Custom trainable components and models" emoji="📖">


@ -6,8 +6,9 @@ menu:
- ['Introduction', 'basics']
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
<!-- - ['Data Utilities', 'data'] -->
- ['Custom Training', 'config-custom']
- ['Custom Functions', 'custom-functions']
- ['Data Utilities', 'data']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
---
@ -122,7 +123,7 @@ treebank.
</Project>
## Training config {#config}
## Training config system {#config}
Training config files include all **settings and hyperparameters** for training
your pipeline. Instead of providing lots of arguments on the command line, you
@ -177,6 +178,7 @@ sections of a config file are:
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `training` | Settings and controls for the training and evaluation process. |
| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
| `initialize` | Data resources and arguments passed to components when [`nlp.initialize`](/api/language#initialize) is called before training (but not at runtime). |
<Infobox title="Config format and settings" emoji="📖">
@ -190,6 +192,20 @@ available for the different architectures are documented with the
</Infobox>
### Config lifecycle at runtime and training {#config-lifecycle}
A pipeline's `config.cfg` is considered the "single source of truth", both at
**training** and **runtime**. Under the hood,
[`Language.from_config`](/api/language#from_config) takes care of constructing
the `nlp` object using the settings defined in the config. An `nlp` object's
config is available as [`nlp.config`](/api/language#config) and it includes all
information about the pipeline, as well as the settings used to train and
initialize it.
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
<!-- TODO: explain lifecycle and initialization -->
### Overwriting config settings on the command line {#config-overrides}
The config system means that you can define all settings **in one place** and in
@ -233,6 +249,61 @@ defined in the config file.
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
```
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the
`${section.value}` syntax. In this example, the value of `seed` is reused within
the `[training]` block, and the whole block of `[training.optimizer]` is reused
in `[pretraining]` and will become `pretraining.optimizer`.
```ini
### config.cfg (excerpt) {highlight="5,18"}
[system]
seed = 0
[training]
seed = ${system.seed}
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
[pretraining]
optimizer = ${training.optimizer}
```
You can also use variables inside strings. In that case, it works just like
f-strings in Python. If the value of a variable is not a string, it's converted
to a string.
```ini
[paths]
version = 5
root = "/Users/you/data"
train = "${paths.root}/train_${paths.version}.spacy"
# Result: /Users/you/data/train_5.spacy
```
<Infobox title="Tip: Override variables on the CLI" emoji="💡">
If you need to change certain values between training runs, you can define them
once, reference them as variables and then [override](#config-overrides) them on
the CLI. For example, `--paths.root /other/root` will change the value of `root`
in the block `[paths]` and the change will be reflected across all other values
that reference this variable.
</Infobox>
## Customizing the pipeline and training {#config-custom}
### Defining pipeline components {#config-components}
You typically train a [pipeline](/usage/processing-pipelines) of **one or more
@ -353,59 +424,6 @@ stop = 1000
compound = 1.001
```
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the
`${section.value}` syntax. In this example, the value of `seed` is reused within
the `[training]` block, and the whole block of `[training.optimizer]` is reused
in `[pretraining]` and will become `pretraining.optimizer`.
```ini
### config.cfg (excerpt) {highlight="5,18"}
[system]
seed = 0
[training]
seed = ${system.seed}
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
[pretraining]
optimizer = ${training.optimizer}
```
You can also use variables inside strings. In that case, it works just like
f-strings in Python. If the value of a variable is not a string, it's converted
to a string.
```ini
[paths]
version = 5
root = "/Users/you/data"
train = "${paths.root}/train_${paths.version}.spacy"
# Result: /Users/you/data/train_5.spacy
```
<Infobox title="Tip: Override variables on the CLI" emoji="💡">
If you need to change certain values between training runs, you can define them
once, reference them as variables and then [override](#config-overrides) them on
the CLI. For example, `--paths.root /other/root` will change the value of `root`
in the block `[paths]` and the change will be reflected across all other values
that reference this variable.
</Infobox>
### Model architectures {#model-architectures}
> #### 💡 Model type annotations
@ -506,17 +524,7 @@ still look good.
</Accordion>
<!--
## Data Utilities {#data-utilities}
* spacy convert
* The [corpora] block
* Custom corpus class
* Minibatching
* Data augmentation
-->
## Custom Functions {#custom-functions}
## Custom functions {#custom-functions}
Registered functions in the training config files can refer to built-in
implementations, but you can also plug in fully **custom implementations**. All
@ -763,7 +771,96 @@ start = 2
factor = 1.005
```
#### Example: Custom data reading and batching {#custom-code-readers-batchers}
### Defining custom architectures {#custom-architectures}
Built-in pipeline components such as the tagger or named entity recognizer are
constructed with default neural network [models](/api/architectures). You can
change the model architecture entirely by implementing your own custom models
and providing those in the config when creating the pipeline component. See the
documentation on [layers and model architectures](/usage/layers-architectures)
for more details.
> ```ini
> ### config.cfg
> [components.tagger]
> factory = "tagger"
>
> [components.tagger.model]
> @architectures = "custom_neural_network.v1"
> output_width = 512
> ```
```python
### functions.py
from typing import List
from thinc.types import Floats2d
from thinc.api import Model
import spacy
from spacy.tokens import Doc
@spacy.registry.architectures("custom_neural_network.v1")
def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
## Data utilities {#data}
spaCy includes various features and utilities to make it easy to train from your
own data. If you have training data in a standard format like `.conll` or
`.conllu`, the easiest way to convert it for use with spaCy is to run
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
```cli
$ python -m spacy convert ./train.gold.conll ./corpus
```
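
The command writes binary `.spacy` files (serialized [`DocBin`](/api/docbin) objects) to the output directory. A hedged sketch of inspecting the result, assuming the output file is named after the input:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# Hypothetical path: the converted file sits in the chosen output directory
doc_bin = DocBin().from_disk("./corpus/train.gold.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
print(f"{len(docs)} docs with gold annotations")
```
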
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
Training workflows often consist of multiple steps, from preprocessing the data
all the way to packaging and deploying the trained model.
[spaCy projects](/usage/projects) let you define all steps in one file, manage
data assets, track changes and share your end-to-end processes with your team.
</Infobox>
### Working with corpora {#data-corpora}
> #### Example
>
> ```ini
> [corpora]
>
> [corpora.train]
> @readers = "spacy.Corpus.v1"
> path = ${paths.train}
> gold_preproc = false
> max_length = 0
> limit = 0
> augmenter = null
>
> [training]
> train_corpus = "corpora.train"
> ```
The [`[corpora]`](/api/data-formats#config-corpora) block in your config lets
you define **data resources** to use for training, evaluation, pretraining or
any other custom workflows. `corpora.train` and `corpora.dev` are used as
conventions within spaCy's default configs, but you can also define any other
custom blocks. Each section in the corpora config should resolve to a
[`Corpus`](/api/corpus), for example using spaCy's built-in
[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy`
file. The `train_corpus` and `dev_corpus` fields in the
[`[training]`](/api/data-formats#config-training) block specify where to find
the corpus in your config. This makes it easy to **swap out** different corpora
by only changing a single config setting.
Instead of making `[corpora]` a block with multiple subsections for each portion
of the data, you can also use a single function that returns a dictionary of
corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be
especially useful if you need to split a single file into corpora for training
and evaluation, without loading the same file twice.
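
A minimal sketch of that pattern, assuming a hypothetical registry name `my_corpora.v1` (the split logic is illustrative and reads the annotations lazily per corpus):

```python
### functions.py (illustrative)
from pathlib import Path
from typing import Callable, Dict, Iterable

import spacy
from spacy.language import Language
from spacy.tokens import DocBin
from spacy.training import Example


@spacy.registry.readers("my_corpora.v1")  # hypothetical name, not a built-in reader
def create_corpora(
    path: Path, dev_fraction: float = 0.2
) -> Dict[str, Callable[[Language], Iterable[Example]]]:
    """Read a single .spacy file and return train and dev corpora, keyed by name."""

    def read_split(nlp: Language, dev: bool) -> Iterable[Example]:
        docs = list(DocBin().from_disk(path).get_docs(nlp.vocab))
        n_dev = int(len(docs) * dev_fraction)
        for reference in docs[:n_dev] if dev else docs[n_dev:]:
            # The annotated doc is the reference, a freshly tokenized doc the prediction
            yield Example(nlp.make_doc(reference.text), reference)

    return {
        "train": lambda nlp: read_split(nlp, dev=False),
        "dev": lambda nlp: read_split(nlp, dev=True),
    }
```

In the config, `[corpora]` would then point at this function via `@readers = "my_corpora.v1"` and a `path` setting, while `[training]` keeps `train_corpus = "corpora.train"` and `dev_corpus = "corpora.dev"`.
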
### Custom data reading and batching {#custom-code-readers-batchers}
Some use-cases require **streaming in data** or manipulating datasets on the
fly, rather than generating all data beforehand and storing it to file. Instead
@ -859,37 +956,11 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
return create_filtered_batches
```
### Defining custom architectures {#custom-architectures}
Built-in pipeline components such as the tagger or named entity recognizer are
constructed with default neural network [models](/api/architectures). You can
change the model architecture entirely by implementing your own custom models
and providing those in the config when creating the pipeline component. See the
documentation on [layers and model architectures](/usage/layers-architectures)
for more details.
> ```ini
> ### config.cfg
> [components.tagger]
> factory = "tagger"
>
> [components.tagger.model]
> @architectures = "custom_neural_network.v1"
> output_width = 512
> ```
```python
### functions.py
from typing import List
from thinc.types import Floats2d
from thinc.api import Model
import spacy
from spacy.tokens import Doc
@spacy.registry.architectures("custom_neural_network.v1")
def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
<!-- TODO:
* Custom corpus class
* Minibatching
* Data augmentation
-->
## Parallel & distributed training with Ray {#parallel-training}


@ -123,13 +123,14 @@ training run, with no hidden defaults, making it easy to rerun your experiments
and track changes. You can use the
[quickstart widget](/usage/training#quickstart) or the `init config` command to
get started. Instead of providing lots of arguments on the command line, you
only need to pass your `config.cfg` file to `spacy train`.
only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
Training config files include all **settings and hyperparameters** for training
your pipeline. Some settings can also be registered **functions** that you can
swap out and customize, making it easy to implement your own custom models and
architectures.
![Illustration of pipeline lifecycle](../images/lifecycle.svg)
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage:** [Training pipelines and models](/usage/training)
@ -723,7 +724,7 @@ nlp = spacy.blank("en")
Because pipeline components are now added using their string names, you won't
have to instantiate the [component classes](/api/#architecture-pipeline)
directly anynore. To configure the component, you can now use the `config`
directly anymore. To configure the component, you can now use the `config`
argument on [`nlp.add_pipe`](/api/language#add_pipe).
> #### config.cfg (excerpt)