Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-09-13 22:31:22 +02:00
commit ceb850f099
40 changed files with 643 additions and 313 deletions

View File

@ -301,6 +301,7 @@ def ensure_pathy(path):
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
git_version = get_git_version()
if dest.exists():
msg.fail("Destination of checkout must not exist", exits=1)
if not dest.parent.exists():
@ -321,24 +322,28 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
# *that* we can do by path.
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
git_version = get_git_version()
supports_sparse = git_version >= (2, 22)
# This is the "clone, but don't download anything" part.
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
if supports_sparse:
cmd += f"--filter=blob:none" # <-- The key bit
else:
msg.warn(
err_old = (
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
f"that doesn't fully support sparse checkout yet. This means that "
f"more files than necessary may be downloaded temporarily. To "
f"only download the files needed, upgrade to Git v2.22 or above."
f"that doesn't fully support sparse checkout yet."
)
_attempt_run_command(cmd)
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
msg.warn(
f"{err_unk if git_version == (0, 0) else err_old} "
f"This means that more files than necessary may be downloaded "
f"temporarily. To only download the files needed, make sure "
f"you're using Git v2.22 or above."
)
try_run_command(cmd)
# Now we need to find the missing filenames for the subpath we want.
# Looking for this 'rev-list' command in the git --help? Hah.
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
ret = _attempt_run_command(cmd)
ret = try_run_command(cmd)
git_repo = _from_http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
@ -351,27 +356,44 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
msg.fail(err, exits=1)
if supports_sparse:
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
_attempt_run_command(cmd)
try_run_command(cmd)
# And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
_attempt_run_command(cmd)
try_run_command(cmd)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
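# Illustrative summary only (derived from the commands built above): with a
# recent Git, git_sparse_checkout boils down to this sequence of calls, using
# the same repo/tmp_dir/branch/subpath values as the f-strings:
#   git clone {repo} {tmp_dir} --no-checkout --depth 1 -b {branch} --filter=blob:none
#   git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}
#   git -C {tmp_dir} fetch-pack {git_repo} {missings}
#   git -C {tmp_dir} checkout {branch} {subpath}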
def get_git_version() -> Tuple[int, int]:
ret = _attempt_run_command(["git", "--version"])
# TODO: this seems kinda brittle?
version = ret.stdout[11:].strip().split(".")
def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
"""Get the version of git and raise an error if calling 'git --version' fails.
error (str): The error message to show.
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
(0, 0) if the version couldn't be determined.
"""
ret = try_run_command(["git", "--version"], error=error)
stdout = ret.stdout.strip()
if not stdout or not stdout.startswith("git version"):
return (0, 0)
version = stdout[11:].strip().split(".")
return (int(version[0]), int(version[1]))
def _attempt_run_command(cmd: Union[str, List[str]]):
def try_run_command(
cmd: Union[str, List[str]], error: str = "Could not run command"
) -> subprocess.CompletedProcess:
"""Try running a command and raise an error if it fails.
cmd (Union[str, List[str]]): The command to run.
error (str): The error message.
RETURNS (CompletedProcess): The completed process if the command ran.
"""
try:
return run_command(cmd, capture=True)
except subprocess.CalledProcessError as e:
err = f"Could not run command"
msg.fail(err)
msg.fail(error)
print(cmd)
sys.exit(1)
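# Usage sketch for the two helpers above (illustrative only): get_git_version()
# falls back to (0, 0) when the version can't be parsed, and try_run_command()
# prints the error and exits instead of raising, so callers don't need a try/except.
#   major, minor = get_git_version()
#   if (major, minor) >= (2, 22):
#       print(try_run_command("git --version").stdout.strip())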
@ -387,8 +409,15 @@ def _from_http_to_git(repo: str) -> str:
return repo
def string_to_list(value, intify=False):
"""Parse a comma-separated string to a list"""
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
"""Parse a comma-separated string to a list and account for various
formatting options. Mostly used to handle CLI arguments that take a list of
comma-separated values.
value (str): The value to parse.
intify (bool): Whether to convert values to ints.
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
"""
if not value:
return []
if value.startswith("[") and value.endswith("]"):

View File

@ -5,7 +5,8 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
from ._util import Arg, Opt, debug_cli, show_validation_error
from ._util import parse_config_overrides, string_to_list
from .. import util

View File

@ -277,7 +277,7 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
def ensure_shape(lines):
"""Ensure that the first line of the data is the vectors shape.
If it's not, we read in the data and output the shape as the first result,
so that the reader doesn't have to deal with the problem.
"""

View File

@ -1,10 +1,10 @@
from typing import Optional, Dict, Any
import random
from typing import Optional
import numpy
import time
import re
from collections import Counter
from pathlib import Path
from thinc.api import Config
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
@ -15,11 +15,10 @@ import typer
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..errors import Errors
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..tokens import Doc
from ..attrs import ID, HEAD
from ..attrs import ID
from .. import util
@ -30,9 +29,8 @@ from .. import util
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
@ -60,13 +58,35 @@ def pretrain_cli(
DOCS: https://nightly.spacy.io/api/cli#pretrain
"""
overrides = parse_config_overrides(ctx.args)
config_overrides = parse_config_overrides(ctx.args)
import_code(code_path)
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
config = util.load_config(
config_path,
overrides=config_overrides,
interpolate=True
)
if not config.get("pretraining"):
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
pretrain(
texts_loc,
config,
output_dir,
config_path,
config_overrides=overrides,
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
@ -74,52 +94,22 @@ def pretrain_cli(
def pretrain(
texts_loc: Path,
config: Config,
output_dir: Path,
config_path: Path,
config_overrides: Dict[str, Any] = {},
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
use_gpu: int=-1
):
verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=config_overrides)
nlp, config = util.load_model_from_config(config)
pretrain_config = config["pretraining"]
if not pretrain_config:
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
seed = pretrain_config["seed"]
if seed is not None:
fix_random_seed(seed)
if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]:
if config["system"].get("seed") is not None:
fix_random_seed(config["system"]["seed"])
if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
use_pytorch_for_gpu_memory()
config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
if texts_loc != "-": # reading from a file
with msg.loading("Loading input texts..."):
texts = list(srsly.read_jsonl(texts_loc))
random.shuffle(texts)
else: # reading from stdin
msg.info("Reading input text from stdin...")
texts = srsly.read_jsonl("-")
tok2vec_path = pretrain_config["tok2vec_model"]
tok2vec = config
for subpath in tok2vec_path.split("."):
tok2vec = tok2vec.get(subpath)
model = create_pretraining_model(nlp, tok2vec, pretrain_config)
optimizer = pretrain_config["optimizer"]
nlp, config = util.load_model_from_config(config)
P_cfg = config["pretraining"]
corpus = P_cfg["corpus"]
batcher = P_cfg["batcher"]
model = create_pretraining_model(nlp, config["pretraining"])
optimizer = config["pretraining"]["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
@ -147,38 +137,35 @@ def pretrain(
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n")
skip_counter = 0
objective = create_objective(pretrain_config["objective"])
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
for batch_id, batch in enumerate(batches):
docs, count = make_docs(
nlp,
batch,
max_length=pretrain_config["max_length"],
min_length=pretrain_config["min_length"],
)
skip_counter += count
objective = create_objective(P_cfg["objective"])
# TODO: I think we probably want this to look more like the
# 'create_train_batches' function?
for epoch in range(epoch_resume, P_cfg["max_epochs"]):
for batch_id, batch in enumerate(batcher(corpus(nlp))):
docs = ensure_docs(batch)
loss = make_update(model, docs, optimizer, objective)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
break
if pretrain_config["n_save_every"] and (
batch_id % pretrain_config["n_save_every"] == 0
if P_cfg["n_save_every"] and (
batch_id % P_cfg["n_save_every"] == 0
):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file
random.shuffle(texts)
if skip_counter > 0:
msg.warn(f"Skipped {skip_counter} empty values")
msg.good("Successfully finished pretrain")
def ensure_docs(examples_or_docs):
docs = []
for eg_or_doc in examples_or_docs:
if isinstance(eg_or_doc, Doc):
docs.append(eg_or_doc)
else:
docs.append(eg_or_doc.reference)
return docs
def _resume_model(model, resume_path, epoch_resume):
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
@ -211,36 +198,6 @@ def make_update(model, docs, optimizer, objective_func):
return float(loss)
def make_docs(nlp, batch, min_length, max_length):
docs = []
skip_count = 0
for record in batch:
if not isinstance(record, dict):
raise TypeError(Errors.E137.format(type=type(record), line=record))
if "tokens" in record:
words = record["tokens"]
if not words:
skip_count += 1
continue
doc = Doc(nlp.vocab, words=words)
elif "text" in record:
text = record["text"]
if not text:
skip_count += 1
continue
doc = nlp.make_doc(text)
else:
raise ValueError(Errors.E138.format(text=record))
if "heads" in record:
heads = record["heads"]
heads = numpy.asarray(heads, dtype="uint64")
heads = heads.reshape((len(doc), 1))
doc = doc.from_array([HEAD], heads)
if min_length <= len(doc) < max_length:
docs.append(doc)
return docs, skip_count
def create_objective(config):
"""Create the objective for pretraining.
@ -296,7 +253,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
return loss, d_target
def create_pretraining_model(nlp, tok2vec, pretrain_config):
def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
@ -304,6 +261,12 @@ def create_pretraining_model(nlp, tok2vec, pretrain_config):
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
component = nlp.get_pipe(pretrain_config["component"])
if pretrain_config.get("layer"):
tok2vec = component.model.get_ref(pretrain_config["layer"])
else:
tok2vec = component.model
# TODO
maxout_pieces = 3
hidden_size = 300
@ -372,7 +335,7 @@ def _smart_round(figure, width=10, max_decimal=4):
return format_str % figure
def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
@ -388,16 +351,6 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if texts_loc != "-": # reading from a file
texts_loc = Path(texts_loc)
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
for text in srsly.read_jsonl(texts_loc):
break
else:
msg.fail("Input file is empty", texts_loc, exits=1)
if resume_path is not None:
model_name = re.search(r"model\d+\.bin", str(resume_path))
if not model_name and not epoch_resume:

View File

@ -7,7 +7,7 @@ import requests
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
from .._util import download_file, git_sparse_checkout
from .._util import download_file, git_sparse_checkout, get_git_version
@project_cli.command("assets")
@ -41,6 +41,11 @@ def project_assets(project_dir: Path) -> None:
dest = (project_dir / asset["dest"]).resolve()
checksum = asset.get("checksum")
if "git" in asset:
git_err = (
f"Cloning spaCy project templates requires Git and the 'git' command. "
f"Make sure it's installed and that the executable is available."
)
get_git_version(error=git_err)
if dest.exists():
# If there's already a file, check for checksum
if checksum and checksum == get_checksum(dest):

View File

@ -7,7 +7,7 @@ import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_sparse_checkout
from .._util import git_sparse_checkout, get_git_version
@project_cli.command("clone")
@ -70,16 +70,12 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
git_err = (
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually.",
)
get_git_version(error=git_err)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():

View File

@ -671,6 +671,9 @@ class Errors:
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
"that you are providing a list of patterns as `List[List[dict]]`.")
E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
"through token.morph_ instead or add the string to the "
"StringStore with `nlp.vocab.strings.add(string)`.")
@add_codes

View File

@ -244,7 +244,8 @@ class Language:
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
if not self._config["training"].get("score_weights"):
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
combined_score_weights = combine_score_weights(score_weights)
self._config["training"]["score_weights"] = combined_score_weights
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
return self._config
@ -1166,14 +1167,20 @@ class Language:
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Language", obj=type(get_examples))
raise ValueError(err)
valid_examples = False
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
)
raise ValueError(err)
else:
valid_examples = True
for word in [t.text for t in example.reference]:
_ = self.vocab[word] # noqa: F841
if not valid_examples:
err = Errors.E930.format(name="Language", obj="empty list")
raise ValueError(err)
if device >= 0: # TODO: do we need this here?
require_gpu(device)
if self.vocab.vectors.data.shape[1] >= 1:
@ -1274,7 +1281,7 @@ class Language:
util.logger.debug(doc)
eg.predicted = doc
results = scorer.score(examples)
n_words = sum(len(eg.predicted) for eg in examples)
n_words = sum(len(doc) for doc in docs)
results["speed"] = n_words / (end_time - start_time)
return results

View File

@ -56,7 +56,7 @@ subword_features = true
@Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL},
default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
scores=[
"cats_score",
"cats_score_desc",
@ -75,6 +75,7 @@ def make_textcat(
name: str,
model: Model[List[Doc], List[Floats2d]],
labels: Iterable[str],
threshold: float,
) -> "TextCategorizer":
"""Create a TextCategorizer compoment. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels can
@ -86,8 +87,9 @@ def make_textcat(
scores for each category.
labels (list): A list of categories to learn. If empty, the model infers the
categories from the data.
threshold (float): Cutoff to consider a prediction "positive".
"""
return TextCategorizer(nlp.vocab, model, name, labels=labels)
return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold)
class TextCategorizer(Pipe):
@ -103,6 +105,7 @@ class TextCategorizer(Pipe):
name: str = "textcat",
*,
labels: Iterable[str],
threshold: float,
) -> None:
"""Initialize a text categorizer.
@ -111,6 +114,7 @@ class TextCategorizer(Pipe):
name (str): The component instance name, used to add entries to the
losses during training.
labels (Iterable[str]): The labels to use.
threshold (float): Cutoff to consider a prediction "positive".
DOCS: https://nightly.spacy.io/api/textcategorizer#init
"""
@ -118,7 +122,7 @@ class TextCategorizer(Pipe):
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"labels": labels}
cfg = {"labels": labels, "threshold": threshold}
self.cfg = dict(cfg)
@property
@ -371,5 +375,6 @@ class TextCategorizer(Pipe):
labels=self.labels,
multi_label=self.model.attrs["multi_label"],
positive_label=positive_label,
threshold=self.cfg["threshold"],
**kwargs,
)

View File

@ -246,15 +246,14 @@ class ConfigSchemaPretrainEmpty(BaseModel):
class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
min_length: StrictInt = Field(..., title="Minimum length of examples")
max_length: StrictInt = Field(..., title="Maximum length of examples")
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
seed: Optional[StrictInt] = Field(..., title="Random seed")
use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. components.tok2vec.model")
optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: Reader = Field(..., title="Reader for the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
# TODO: use a more detailed schema for this?
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
# fmt: on

View File

@ -9,7 +9,10 @@ from spacy.pipeline.ner import DEFAULT_NER_MODEL
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
doc = Doc(
ner.vocab,
words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
)
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)

View File

@ -66,3 +66,31 @@ def test_morph_set(i_has):
def test_morph_str(i_has):
assert str(i_has[0].morph) == "PronType=prs"
assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
def test_morph_property(tokenizer):
doc = tokenizer("a dog")
# set through token.morph_
doc[0].morph_ = "PronType=prs"
assert doc[0].morph_ == "PronType=prs"
assert doc.to_array(["MORPH"])[0] != 0
# unset with token.morph
doc[0].morph = 0
assert doc.to_array(["MORPH"])[0] == 0
# empty morph is equivalent to "_"
doc[0].morph_ = ""
assert doc[0].morph_ == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# "_" morph is also equivalent to empty morph
doc[0].morph_ = "_"
assert doc[0].morph_ == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# set through existing hash with token.morph
tokenizer.vocab.strings.add("Feat=Val")
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
assert doc[0].morph_ == "Feat=Val"

View File

@ -78,7 +78,7 @@ def patterns(en_vocab):
"REL_OP": ">",
"RIGHT_ID": "fox",
"RIGHT_ATTRS": {"ORTH": "fox"},
}
},
]
pattern5 = [
@ -233,9 +233,7 @@ def test_dependency_matcher_callback(en_vocab, doc):
assert matches == matches2
@pytest.mark.parametrize(
"op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),]
)
@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
# two sentences to test that all matches are within the same sentence
doc = get_doc(
@ -248,7 +246,7 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
for text in ["a", "b", "c", "d", "e"]:
pattern = [
{"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
{"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},},
{"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
]
matcher = DependencyMatcher(en_vocab)
matcher.add("A", [pattern])

View File

@ -54,7 +54,10 @@ def _parser_example(parser):
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
doc = Doc(
ner.vocab,
words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
)
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)

View File

@ -30,9 +30,10 @@ TRAIN_DATA = [
),
]
def test_begin_training_examples():
nlp = Language()
senter = nlp.add_pipe("senter")
nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

View File

@ -89,7 +89,7 @@ def test_no_label():
def test_implicit_label():
nlp = Language()
textcat = nlp.add_pipe("textcat")
nlp.add_pipe("textcat")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

View File

@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
# See issue #1105
cfg = {"model": DEFAULT_TEXTCAT_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"])
textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5)
textcat.to_bytes(exclude=["vocab"])

View File

@ -5,7 +5,6 @@ from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
@ -231,48 +230,6 @@ def test_cli_converters_conll_ner2json():
assert ent.text in ["New York City", "London"]
def test_pretrain_make_docs():
nlp = English()
valid_jsonl_text = {"text": "Some text"}
docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
assert len(docs) == 1
assert skip_count == 0
valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
assert len(docs) == 1
assert skip_count == 0
invalid_jsonl_type = 0
with pytest.raises(TypeError):
make_docs(nlp, [invalid_jsonl_type], 1, 100)
invalid_jsonl_key = {"invalid": "Does not matter"}
with pytest.raises(ValueError):
make_docs(nlp, [invalid_jsonl_key], 1, 100)
empty_jsonl_text = {"text": ""}
docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
assert len(docs) == 0
assert skip_count == 1
empty_jsonl_tokens = {"tokens": []}
docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
assert len(docs) == 0
assert skip_count == 1
too_short_jsonl = {"text": "This text is not long enough"}
docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
assert len(docs) == 0
assert skip_count == 0
too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
assert len(docs) == 0
assert skip_count == 0
def test_project_config_validation_full():
config = {
"vars": {"some_var": 20},

View File

@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
doc = tokenizer(text)
assert [token.text for token in doc] == ["_SPECIAL_", "."]
def test_tokenizer_special_cases_idx(tokenizer):
text = "the _ID'X_"
tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
doc = tokenizer(text)
assert doc[1].idx == 4
assert doc[2].idx == 7

View File

@ -343,8 +343,9 @@ cdef class Tokenizer:
for j in range(cached.length):
tokens[i + offset + j] = cached.data.tokens[j]
tokens[i + offset + j].idx = orig_idx + idx_offset
idx_offset += cached.data.tokens[j].lex.length + \
1 if cached.data.tokens[j].spacy else 0
idx_offset += cached.data.tokens[j].lex.length
if cached.data.tokens[j].spacy:
idx_offset += 1
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
i += span_end - span_start
offset += span[3]

View File

@ -214,9 +214,17 @@ cdef class Token:
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
@property
def morph(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
property morph:
def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
def __set__(self, attr_t morph):
if morph == 0:
self.c.morph = morph
elif morph in self.vocab.strings:
self.morph_ = self.vocab.strings[morph]
else:
raise ValueError(Errors.E1009.format(val=morph))
property morph_:
def __get__(self):

View File

@ -1,6 +1,7 @@
import warnings
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path
import srsly
from .. import util
from .example import Example
@ -21,6 +22,36 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
@util.registry.readers("spacy.JsonlReader.v1")
def create_jsonl_reader(
path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
) -> Callable[["Language"], Iterable[Example]]:
return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
path = util.ensure_path(path)
if not path.is_dir() and path.parts[-1].endswith(file_type):
return [path]
orig_path = path
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts and path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith(file_type):
locs.append(path)
if len(locs) == 0:
warnings.warn(Warnings.W090.format(path=orig_path))
return locs
class Corpus:
"""Iterate Example objects from a file or directory of DocBin (.spacy)
@ -47,36 +78,13 @@ class Corpus:
*,
limit: int = 0,
gold_preproc: bool = False,
max_length: bool = False,
max_length: int = 0,
) -> None:
self.path = util.ensure_path(path)
self.gold_preproc = gold_preproc
self.max_length = max_length
self.limit = limit
@staticmethod
def walk_corpus(path: Union[str, Path]) -> List[Path]:
path = util.ensure_path(path)
if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE):
return [path]
orig_path = path
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts and path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith(FILE_TYPE):
locs.append(path)
if len(locs) == 0:
warnings.warn(Warnings.W090.format(path=orig_path))
return locs
def __call__(self, nlp: "Language") -> Iterator[Example]:
"""Yield examples from the data.
@ -85,11 +93,11 @@ class Corpus:
DOCS: https://nightly.spacy.io/api/corpus#call
"""
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
if self.gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
examples = self.make_examples(nlp, ref_docs, self.max_length)
examples = self.make_examples(nlp, ref_docs)
yield from examples
def _make_example(
@ -108,18 +116,18 @@ class Corpus:
return Example(nlp.make_doc(reference.text), reference)
def make_examples(
self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
self, nlp: "Language", reference_docs: Iterable[Doc]
) -> Iterator[Example]:
for reference in reference_docs:
if len(reference) == 0:
continue
elif max_length == 0 or len(reference) < max_length:
elif self.max_length == 0 or len(reference) < self.max_length:
yield self._make_example(nlp, reference, False)
elif reference.is_sentenced:
for ref_sent in reference.sents:
if len(ref_sent) == 0:
continue
elif max_length == 0 or len(ref_sent) < max_length:
elif self.max_length == 0 or len(ref_sent) < self.max_length:
yield self._make_example(nlp, ref_sent.as_doc(), False)
def make_examples_gold_preproc(
@ -151,3 +159,57 @@ class Corpus:
i += 1
if self.limit >= 1 and i >= self.limit:
break
class JsonlTexts:
"""Iterate Doc objects from a file or directory of jsonl
formatted raw text files.
path (Path): The directory or filename to read from.
min_length (int): Minimum document length (in tokens). Shorter documents
will be skipped. Defaults to 0, which indicates no limit.
max_length (int): Maximum document length (in tokens). Longer documents will
be skipped. Defaults to 0, which indicates no limit.
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
Defaults to 0, which indicates no limit.
DOCS: https://nightly.spacy.io/api/corpus
"""
file_type = "jsonl"
def __init__(
self,
path: Union[str, Path],
*,
limit: int = 0,
min_length: int = 0,
max_length: int = 0,
) -> None:
self.path = util.ensure_path(path)
self.min_length = min_length
self.max_length = max_length
self.limit = limit
def __call__(self, nlp: "Language") -> Iterator[Example]:
"""Yield examples from the data.
nlp (Language): The current nlp object.
YIELDS (Example): The examples.
DOCS: https://nightly.spacy.io/api/corpus#call
"""
for loc in walk_corpus(self.path, "jsonl"):
records = srsly.read_jsonl(loc)
for record in records:
doc = nlp.make_doc(record["text"])
if self.min_length >= 1 and len(doc) < self.min_length:
continue
elif self.max_length >= 1 and len(doc) >= self.max_length:
continue
else:
words = [w.text for w in doc]
spaces = [bool(w.whitespace_) for w in doc]
# We don't *need* an example here, but it seems nice to
# make it match the Corpus signature.
yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
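# Usage sketch (illustrative; the path is made up): the registered
# "spacy.JsonlReader.v1" function above returns a JsonlTexts instance, i.e. a
# callable that yields Example objects for a given nlp object.
#   reader = create_jsonl_reader("corpus/raw_text.jsonl", min_length=5, max_length=500)
#   for example in reader(nlp):
#       ...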

View File

@ -36,20 +36,12 @@ def console_logger():
keys=list(info["losses"].keys()),
)
) from None
try:
scores = [
"{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100)
for col in score_cols
]
except KeyError as e:
raise KeyError(
Errors.E983.format(
dict="scores (other)",
key=str(e),
keys=list(info["other_scores"].keys()),
)
) from None
scores = []
for col in score_cols:
score = float(info["other_scores"].get(col, 0.0))
if col != "speed":
score *= 100
scores.append("{0:.2f}".format(score))
data = (
[info["epoch"], info["step"]]
+ losses

View File

@ -648,12 +648,20 @@ def join_command(command: List[str]) -> str:
return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
def run_command(
command: Union[str, List[str]],
*,
capture: bool = False,
stdin: Optional[Any] = None,
) -> Optional[subprocess.CompletedProcess]:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
stdin (Optional[Any]): stdin to read from or None.
capture (bool): Whether to capture the output.
RETURNS (Optional[CompletedProcess]): The process object.
"""
if isinstance(command, str):
command = split_command(command)
@ -671,6 +679,10 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
raise FileNotFoundError(
Errors.E970.format(str_command=" ".join(command), tool=command[0])
) from None
except subprocess.CalledProcessError as e:
# We don't want a duplicate traceback here
print(e)
sys.exit(1)
if ret.returncode != 0:
sys.exit(ret.returncode)
return ret
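# Illustrative usage (a sketch): with capture=True the CompletedProcess is
# returned and its stdout can be inspected, as the Git helpers in cli/_util.py do.
#   ret = run_command(["git", "--version"], capture=True)
#   print(ret.stdout.strip())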

View File

@ -14,6 +14,7 @@ menu:
- ['evaluate', 'evaluate']
- ['package', 'package']
- ['project', 'project']
- ['ray', 'ray']
---
spaCy's CLI provides a range of helpful commands for downloading and training
@ -1134,3 +1135,47 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
## ray {#ray new="3"}
The `spacy ray` CLI includes commands for parallel and distributed computing via
[Ray](https://ray.io).
<Infobox variant="warning">
To use this command, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
### ray train {#ray-train tag="command"}
Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
command works just like [`spacy train`](/api/cli#train). For more details and
examples, see the usage guide on
[parallel training](/usage/training#parallel-training) and the spaCy project
[integration](/usage/projects#ray).
```cli
$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
```
> #### Example
>
> ```cli
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
| Name | Description |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |

View File

@ -30,15 +30,17 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
> config = {
> "labels": [],
> "threshold": 0.5,
> "model": DEFAULT_TEXTCAT_MODEL,
> }
> nlp.add_pipe("textcat", config=config)
> ```
| Setting | Description |
| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
| Setting | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/textcat.py
@ -58,7 +60,7 @@ architectures and their arguments and hyperparameters.
>
> # Construction from class
> from spacy.pipeline import TextCategorizer
> textcat = TextCategorizer(nlp.vocab, model)
> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5)
> ```
Create a new pipeline instance. In your application, you would normally use a
@ -72,6 +74,7 @@ shortcut for this and instantiate the component using its string name and
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `labels` | The labels to use. ~~Iterable[str]~~ |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
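For illustration, a minimal sketch of adding the component with a custom
`threshold` through `nlp.add_pipe` (the blank English pipeline is only an
assumption for the example; any other settings keep their defaults):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat", config={"threshold": 0.6})
```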
## TextCategorizer.\_\_call\_\_ {#call tag="method"}

File diff suppressed because one or more lines are too long

(New image file added, 67 KiB.)

View File

@ -26,7 +26,7 @@ on training Stanza on this corpus to allow direct comparison.
<figure>
| System | POS | USA | LAS |
| System | POS | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
| spaCy RoBERTa (2020) | | | |
| spaCy CNN (2020) | | | |

View File

@ -61,17 +61,13 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
<Benchmarks />
<!-- TODO:
<Project id="benchmarks/penn_treebank">
<Project id="benchmarks/parsing_penn_treebank">
The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
our project template.
</Project>
-->
<!-- ## Citing spaCy {#citation}
<!-- TODO: update -->

View File

@ -796,11 +796,9 @@ workflows, including
evaluation workflow that lets you compare two different models and their
results.
<Project id="integrations/prodigy">
<!-- TODO: <Project id="integrations/prodigy">
<!-- TODO: -->
</Project>
</Project> -->
---
@ -817,7 +815,7 @@ full embedded visualizer, as well as individual components.
> #### Installation
>
> ```bash
> $ pip install "spacy_streamlit>=1.0.0a0"
> $ pip install "spacy-streamlit>=1.0.0a0"
> ```
![](../images/spacy-streamlit.png)
@ -915,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>
<!-- TODO: document -->
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray for parallel and distributed
training with spaCy via our lightweight
[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
package is installed in the same environment as spaCy, it will automatically add
[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
`project.yml` just like the regular training command:
<!-- prettier-ignore -->
```yaml
### project.yml
- name: "ray"
help: "Train a model via parallel training with Ray"
script:
- "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
deps:
- "corpus/train.spacy"
- "corpus/dev.spacy"
```
<!-- TODO: <Project id="integrations/ray">
</Project> -->
---
@ -943,12 +973,14 @@ your results.
![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values')
<!-- TODO:
<Project id="integrations/wandb">
Get started with tracking your spaCy training runs in Weights & Biases using our
project template. It includes a simple config using the `WandbLogger`, as well
as a custom logger implementation you can adjust for your specific use case.
<!-- TODO: -->
</Project>
-->

View File

@ -1075,7 +1075,7 @@ relations and tokens we want to match:
> #### Visualizing the parse
>
> The [`displacy` visualizer](/usage/visualizer) lets you render `Doc` objects
> The [`displacy` visualizer](/usage/visualizers) lets you render `Doc` objects
> and their dependency parse and part-of-speech tags:
>
> ```python

View File

@ -7,7 +7,7 @@ menu:
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
- ['Custom Functions', 'custom-functions']
# - ['Parallel Training', 'parallel-training']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
---
@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
## Parallel & distributed training with Ray {#parallel-training}
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster, though: it depends on your batch size, models,
and hardware.
<Infobox variant="warning">
To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.
```cli
python -m spacy ray train config.cfg --n-workers 2
```
<!-- TODO: <Project id="integrations/ray">
</Project> -->
### How parallel training works {#parallel-training-details}
Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
![Illustration of setup](../images/spacy-ray.svg)
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the
gradient. Workers use the gradients they receive and the ones they compute
locally to update the parameters they own, and then broadcast the updated array
and a new version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates; they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.
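The snippet below is a schematic sketch of this ownership and versioning logic,
not the actual `spacy-ray` implementation; all names (`owner`, `versions`,
`receive_gradient`) are made up for illustration:

```python
import numpy

# One entry per parameter array; each array is owned by exactly one worker.
params = {"embed": numpy.zeros(4), "output": numpy.zeros(4)}
owner = {"embed": 0, "output": 1}
versions = {name: 0 for name in params}  # bumped by the owner on every update

def receive_gradient(worker_id, name, grad, grad_version, lr=0.001):
    """What a worker does with a gradient it computed or received."""
    if owner[name] != worker_id:
        return "not the owner: send the gradient (plus version) to the owner"
    if grad_version != versions[name]:
        return "stale gradient: discard it"
    params[name] -= lr * grad            # apply the update locally
    versions[name] += 1
    return "applied: broadcast the new array and version ID to the other workers"

print(receive_gradient(0, "embed", numpy.ones(4), grad_version=0))
print(receive_gradient(0, "output", numpy.ones(4), grad_version=0))
```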
## Internal training API {#api}
<Infobox variant="warning">

View File

@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
- [Training & config system](#features-training)
- [Custom models](#features-custom-models)
- [End-to-end project workflows](#features-projects)
- [Parallel training with Ray](#features-parallel-training)
- [New built-in components](#features-pipeline-components)
- [New custom component API](#features-components)
- [Dependency matching](#features-dep-matcher)
@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
</Infobox>
### Parallel and distributed training with Ray {#features-parallel-training}
> #### Example
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> # Train a pipeline
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. The Ray
integration is powered by a lightweight extension package,
[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
parallel training.
![Illustration of setup](../images/spacy-ray.svg)
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage: **
[Parallel and distributed training](/usage/training#parallel-training),
[spaCy Projects integration](/usage/projects#ray)
- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
</Infobox>
### New built-in pipeline components {#features-pipeline-components}
spaCy v3.0 includes several new trainable and rule-based components that you can
@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
### New and updated documentation {#new-docs}

View File

@ -26,11 +26,27 @@ const replacements = {
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
}
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
module.exports = {
siteMetadata: {
...site,
sidebars,
...models,
counts: getCounts(models.languages),
universe,
nightly: isNightly,
binderBranch,

View File

@ -1,5 +1,16 @@
{
"resources": [
{
"id": "spacy-ray",
"title": "spacy-ray",
"slogan": "Parallel and distributed training with spaCy and Ray",
"description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
"github": "explosion/spacy-ray",
"pip": "spacy-ray",
"category": ["training"],
"author": "Explosion / Anyscale",
"thumb": "https://i.imgur.com/7so6ZpS.png"
},
{
"id": "spacy-sentence-bert",
"title": "spaCy - sentence-transformers",
@ -2518,14 +2529,14 @@
"description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
"pip": "cov-bsv",
"code_example": [
"import cov_bsv",
"",
"nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"",
"print(doc.ents)",
"print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)"
"import cov_bsv",
"",
"nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"",
"print(doc.ents)",
"print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)"
],
"category": ["pipeline", "standalone", "biomedical", "scientific"],
"tags": ["clinical", "epidemiology", "covid-19", "surveillance"],

View File

@ -14,6 +14,7 @@ import GitHubCode from './github'
import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 30
const CLI_GROUPS = ['init', 'debug', 'project', 'ray']
export default props => (
<Pre>
@ -99,7 +100,6 @@ function replacePrompt(line, prompt, isFirst = false) {
}
function parseArgs(raw) {
const commandGroups = ['init', 'debug', 'project']
let args = raw.split(' ').filter(arg => arg)
const result = {}
while (args.length) {
@ -108,7 +108,12 @@ function parseArgs(raw) {
const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
result[opt] = isFlag ? true : args.shift()
} else {
const key = commandGroups.includes(opt) ? `${opt} ${args.shift()}` : opt
let key = opt
if (CLI_GROUPS.includes(opt)) {
if (args.length && !args[0].startsWith('-')) {
key = `${opt} ${args.shift()}`
}
}
result[key] = null
}
}

View File

@ -38,8 +38,8 @@ export const LandingSubtitle = ({ children }) => (
)
export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => (
<Content className={classNames(classes.grid, { [classes.blocks]: blocks })}>
<Grid cols={cols} narrow={blocks} style={style}>
<Content className={classNames({ [classes.blocks]: blocks })}>
<Grid cols={cols} narrow={blocks} className={classes.grid} style={style}>
{children}
</Grid>
</Content>

View File

@ -26,8 +26,11 @@
border-bottom-right-radius: 0
.icon
width: 2rem
height: 2rem
$width: 2rem
width: $width
height: $width
flex: 0 0 $width
background: var(--color-theme)
color: var(--color-back)
border-radius: 50%

View File

@ -128,14 +128,17 @@
padding-right: 2rem
@include breakpoint(max, md)
.banner
padding: 1rem 3rem
.banner-content
display: block
.banner-text
padding-top: 0
.col
grid-column: 1 / span 2
.grid
grid-template-columns: 1fr !important
.banner-button
margin-bottom: var(--spacing-sm)

View File

@ -54,23 +54,8 @@ for entity in doc.ents:
print(entity.text, entity.label_)
`
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
const Landing = ({ data }) => {
const counts = getCounts(data.languages)
const { counts } = data
return (
<>
<LandingHeader nightly={data.nightly}>
@ -345,7 +330,10 @@ const landingQuery = graphql`
siteMetadata {
nightly
repo
languages {
counts {
langs
modelLangs
starterLangs
models
starters
}