Merge branch 'develop' into nightly.spacy.io

Ines Montani · 2020-09-13 22:31:22 +02:00 · commit ceb850f099
40 changed files with 643 additions and 313 deletions


@@ -301,6 +301,7 @@ def ensure_pathy(path):
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
+    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
@@ -321,24 +322,28 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
    # *that* we can do by path.
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
-        git_version = get_git_version()
        supports_sparse = git_version >= (2, 22)
        # This is the "clone, but don't download anything" part.
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
        if supports_sparse:
            cmd += f"--filter=blob:none"  # <-- The key bit
        else:
-            msg.warn(
+            err_old = (
                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-                f"that doesn't fully support sparse checkout yet. This means that "
-                f"more files than necessary may be downloaded temporarily. To "
-                f"only download the files needed, upgrade to Git v2.22 or above."
+                f"that doesn't fully support sparse checkout yet."
            )
-        _attempt_run_command(cmd)
+            err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
+            msg.warn(
+                f"{err_unk if git_version == (0, 0) else err_old} "
+                f"This means that more files than necessary may be downloaded "
+                f"temporarily. To only download the files needed, make sure "
+                f"you're using Git v2.22 or above."
+            )
+        try_run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
-        ret = _attempt_run_command(cmd)
+        ret = try_run_command(cmd)
        git_repo = _from_http_to_git(repo)
        # Now pass those missings into another bit of git internals
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
@@ -351,27 +356,44 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
            msg.fail(err, exits=1)
        if supports_sparse:
            cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-            _attempt_run_command(cmd)
+            try_run_command(cmd)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        _attempt_run_command(cmd)
+        try_run_command(cmd)
        # We need Path(name) to make sure we also support subdirectories
        shutil.move(str(tmp_dir / Path(subpath)), str(dest))


-def get_git_version() -> Tuple[int, int]:
-    ret = _attempt_run_command(["git", "--version"])
-    # TODO: this seems kinda brittle?
-    version = ret.stdout[11:].strip().split(".")
+def get_git_version(
+    error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
+) -> Tuple[int, int]:
+    """Get the version of git and raise an error if calling 'git --version' fails.
+    error (str): The error message to show.
+    RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
+        (0, 0) if the version couldn't be determined.
+    """
+    ret = try_run_command(["git", "--version"], error=error)
+    stdout = ret.stdout.strip()
+    if not stdout or not stdout.startswith("git version"):
+        return (0, 0)
+    version = stdout[11:].strip().split(".")
    return (int(version[0]), int(version[1]))


-def _attempt_run_command(cmd: Union[str, List[str]]):
+def try_run_command(
+    cmd: Union[str, List[str]], error: str = "Could not run command"
+) -> subprocess.CompletedProcess:
+    """Try running a command and raise an error if it fails.
+    cmd (Union[str, List[str]]): The command to run.
+    error (str): The error message.
+    RETURNS (CompletedProcess): The completed process if the command ran.
+    """
    try:
        return run_command(cmd, capture=True)
    except subprocess.CalledProcessError as e:
-        err = f"Could not run command"
-        msg.fail(err)
+        msg.fail(error)
        print(cmd)
        sys.exit(1)
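For orientation, a minimal usage sketch of the helpers above. The module path and the repository/subpath values are illustrative assumptions, not part of this diff:

```python
from pathlib import Path
from spacy.cli._util import get_git_version, git_sparse_checkout, try_run_command  # path assumed

# (0, 0) means the Git version could not be parsed; >= (2, 22) enables sparse checkout.
major, minor = get_git_version()
print(f"git {major}.{minor}")

# try_run_command exits with a friendly message instead of a traceback on failure.
ret = try_run_command(["git", "--version"])
print(ret.stdout.strip())

# Check out only a subdirectory of a repo into a destination that doesn't exist yet.
git_sparse_checkout(
    "https://github.com/explosion/projects",  # example repository
    "some/subpath",                           # hypothetical subdirectory
    Path("local_dest"),
    branch="master",
)
```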
@@ -387,8 +409,15 @@ def _from_http_to_git(repo: str) -> str:
    return repo


-def string_to_list(value, intify=False):
-    """Parse a comma-separated string to a list"""
+def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
+    """Parse a comma-separated string to a list and account for various
+    formatting options. Mostly used to handle CLI arguments that take a list of
+    comma-separated values.
+    value (str): The value to parse.
+    intify (bool): Whether to convert values to ints.
+    RETURNS (Union[List[str], List[int]]): A list of strings or ints.
+    """
    if not value:
        return []
    if value.startswith("[") and value.endswith("]"):
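A quick sketch of the intended behavior of `string_to_list`; the expected outputs are inferred from the docstring and surrounding code rather than verified here:

```python
from spacy.cli._util import string_to_list  # module path assumed

print(string_to_list("train,dev,test"))          # expected: ['train', 'dev', 'test']
print(string_to_list("[1, 2, 3]", intify=True))  # expected: [1, 2, 3]
print(string_to_list(""))                        # expected: []
```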


@@ -5,7 +5,8 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
import typer

-from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
+from ._util import Arg, Opt, debug_cli, show_validation_error
+from ._util import parse_config_overrides, string_to_list
from .. import util


@@ -1,10 +1,10 @@
-from typing import Optional, Dict, Any
-import random
+from typing import Optional
import numpy
import time
import re
from collections import Counter
from pathlib import Path
+from thinc.api import Config
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
@@ -15,11 +15,10 @@ import typer
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
-from ..errors import Errors
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..tokens import Doc
-from ..attrs import ID, HEAD
+from ..attrs import ID
from .. import util
@@ -30,9 +29,8 @@ from .. import util
def pretrain_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
-    texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
-    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
+    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
@@ -60,13 +58,35 @@ def pretrain_cli(
    DOCS: https://nightly.spacy.io/api/cli#pretrain
    """
-    overrides = parse_config_overrides(ctx.args)
+    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
+    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
+    if use_gpu >= 0:
+        msg.info("Using GPU")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
+    msg.info(f"Loading config from: {config_path}")
+    with show_validation_error(config_path):
+        config = util.load_config(
+            config_path,
+            overrides=config_overrides,
+            interpolate=True
+        )
+    if not config.get("pretraining"):
+        # TODO: What's the solution here? How do we handle optional blocks?
+        msg.fail("The [pretraining] block in your config is empty", exits=1)
+    if not output_dir.exists():
+        output_dir.mkdir()
+        msg.good(f"Created output directory: {output_dir}")
+    config.to_disk(output_dir / "config.cfg")
+    msg.good("Saved config file in the output directory")
    pretrain(
-        texts_loc,
+        config,
        output_dir,
-        config_path,
-        config_overrides=overrides,
        resume_path=resume_path,
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
@@ -74,52 +94,22 @@ def pretrain_cli(
def pretrain(
-    texts_loc: Path,
+    config: Config,
    output_dir: Path,
-    config_path: Path,
-    config_overrides: Dict[str, Any] = {},
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
+    use_gpu: int=-1
):
-    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    msg.info(f"Loading config from: {config_path}")
-    with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=config_overrides)
-    nlp, config = util.load_model_from_config(config)
-    pretrain_config = config["pretraining"]
-    if not pretrain_config:
-        # TODO: What's the solution here? How do we handle optional blocks?
-        msg.fail("The [pretraining] block in your config is empty", exits=1)
-    if not output_dir.exists():
-        output_dir.mkdir()
-        msg.good(f"Created output directory: {output_dir}")
-    seed = pretrain_config["seed"]
-    if seed is not None:
-        fix_random_seed(seed)
-    if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]:
+    if config["system"].get("seed") is not None:
+        fix_random_seed(config["system"]["seed"])
+    if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
        use_pytorch_for_gpu_memory()
-    config.to_disk(output_dir / "config.cfg")
-    msg.good("Saved config file in the output directory")
-    if texts_loc != "-":  # reading from a file
-        with msg.loading("Loading input texts..."):
-            texts = list(srsly.read_jsonl(texts_loc))
-        random.shuffle(texts)
-    else:  # reading from stdin
-        msg.info("Reading input text from stdin...")
-        texts = srsly.read_jsonl("-")
-    tok2vec_path = pretrain_config["tok2vec_model"]
-    tok2vec = config
-    for subpath in tok2vec_path.split("."):
-        tok2vec = tok2vec.get(subpath)
-    model = create_pretraining_model(nlp, tok2vec, pretrain_config)
-    optimizer = pretrain_config["optimizer"]
+    nlp, config = util.load_model_from_config(config)
+    P_cfg = config["pretraining"]
+    corpus = P_cfg["corpus"]
+    batcher = P_cfg["batcher"]
+    model = create_pretraining_model(nlp, config["pretraining"])
+    optimizer = config["pretraining"]["optimizer"]

    # Load in pretrained weights to resume from
    if resume_path is not None:
@@ -147,38 +137,35 @@ def pretrain(
        with (output_dir / "log.jsonl").open("a") as file_:
            file_.write(srsly.json_dumps(log) + "\n")

-    skip_counter = 0
-    objective = create_objective(pretrain_config["objective"])
-    for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
-        batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
-        for batch_id, batch in enumerate(batches):
-            docs, count = make_docs(
-                nlp,
-                batch,
-                max_length=pretrain_config["max_length"],
-                min_length=pretrain_config["min_length"],
-            )
-            skip_counter += count
+    objective = create_objective(P_cfg["objective"])
+    # TODO: I think we probably want this to look more like the
+    # 'create_train_batches' function?
+    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
+        for batch_id, batch in enumerate(batcher(corpus(nlp))):
+            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
-                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
-                    break
-            if pretrain_config["n_save_every"] and (
-                batch_id % pretrain_config["n_save_every"] == 0
+            if P_cfg["n_save_every"] and (
+                batch_id % P_cfg["n_save_every"] == 0
            ):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
-        if texts_loc != "-":
-            # Reshuffle the texts if texts were loaded from a file
-            random.shuffle(texts)
-    if skip_counter > 0:
-        msg.warn(f"Skipped {skip_counter} empty values")
    msg.good("Successfully finished pretrain")


+def ensure_docs(examples_or_docs):
+    docs = []
+    for eg_or_doc in examples_or_docs:
+        if isinstance(eg_or_doc, Doc):
+            docs.append(eg_or_doc)
+        else:
+            docs.append(eg_or_doc.reference)
+    return docs


def _resume_model(model, resume_path, epoch_resume):
    msg.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
@@ -211,36 +198,6 @@ def make_update(model, docs, optimizer, objective_func):
    return float(loss)


-def make_docs(nlp, batch, min_length, max_length):
-    docs = []
-    skip_count = 0
-    for record in batch:
-        if not isinstance(record, dict):
-            raise TypeError(Errors.E137.format(type=type(record), line=record))
-        if "tokens" in record:
-            words = record["tokens"]
-            if not words:
-                skip_count += 1
-                continue
-            doc = Doc(nlp.vocab, words=words)
-        elif "text" in record:
-            text = record["text"]
-            if not text:
-                skip_count += 1
-                continue
-            doc = nlp.make_doc(text)
-        else:
-            raise ValueError(Errors.E138.format(text=record))
-        if "heads" in record:
-            heads = record["heads"]
-            heads = numpy.asarray(heads, dtype="uint64")
-            heads = heads.reshape((len(doc), 1))
-            doc = doc.from_array([HEAD], heads)
-        if min_length <= len(doc) < max_length:
-            docs.append(doc)
-    return docs, skip_count


def create_objective(config):
    """Create the objective for pretraining.
@@ -296,7 +253,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
    return loss, d_target


-def create_pretraining_model(nlp, tok2vec, pretrain_config):
+def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
@@ -304,6 +261,12 @@ def create_pretraining_model(nlp, tok2vec, pretrain_config):
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
+    component = nlp.get_pipe(pretrain_config["component"])
+    if pretrain_config.get("layer"):
+        tok2vec = component.model.get_ref(pretrain_config["layer"])
+    else:
+        tok2vec = component.model
    # TODO
    maxout_pieces = 3
    hidden_size = 300
@@ -372,7 +335,7 @@ def _smart_round(figure, width=10, max_decimal=4):
    return format_str % figure


-def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
+def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if output_dir.exists() and [p for p in output_dir.iterdir()]:
@@ -388,16 +351,6 @@ def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resum
            "It is better to use an empty directory or refer to a new output path, "
            "then the new directory will be created for you.",
        )
-    if texts_loc != "-":  # reading from a file
-        texts_loc = Path(texts_loc)
-        if not texts_loc.exists():
-            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
-        for text in srsly.read_jsonl(texts_loc):
-            break
-        else:
-            msg.fail("Input file is empty", texts_loc, exits=1)
    if resume_path is not None:
        model_name = re.search(r"model\d+\.bin", str(resume_path))
        if not model_name and not epoch_resume:
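To make the new config-driven flow concrete, here is a minimal sketch of how a corpus reader and batcher plug into the pretraining loop above. The inline reader and batcher are stand-ins for the registered functions the real command resolves from the config; they are illustrative assumptions only:

```python
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

def corpus(nlp):
    # Stand-in for P_cfg["corpus"]: yields raw Doc objects (or Examples).
    for text in ["A short text.", "Another raw text for pretraining."]:
        yield nlp.make_doc(text)

def batcher(docs):
    # Stand-in for P_cfg["batcher"]: groups the stream into fixed-size batches.
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) == 2:
            yield batch
            batch = []
    if batch:
        yield batch

def ensure_docs(examples_or_docs):
    # Same logic as the helper above: accept either Doc objects or Examples.
    return [x if isinstance(x, Doc) else x.reference for x in examples_or_docs]

for batch_id, batch in enumerate(batcher(corpus(nlp))):
    docs = ensure_docs(batch)
    print(batch_id, [len(doc) for doc in docs])
```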


@@ -7,7 +7,7 @@ import requests

from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
-from .._util import download_file, git_sparse_checkout
+from .._util import download_file, git_sparse_checkout, get_git_version


@project_cli.command("assets")
@@ -41,6 +41,11 @@ def project_assets(project_dir: Path) -> None:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
+            git_err = (
+                f"Cloning spaCy project templates requires Git and the 'git' command. "
+                f"Make sure it's installed and that the executable is available."
+            )
+            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):


@@ -7,7 +7,7 @@ import re

from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_sparse_checkout
+from .._util import git_sparse_checkout, get_git_version


@project_cli.command("clone")
@@ -70,16 +70,12 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
-    try:
-        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
-    except Exception:
-        msg.fail(
-            f"Cloning spaCy project templates requires Git and the 'git' command. ",
-            f"To clone a project without Git, copy the files from the '{name}' "
-            f"directory in the {repo} to {dest} manually and then run:",
-            f"{COMMAND} project init {dest}",
-            exits=1,
-        )
+    git_err = (
+        f"Cloning spaCy project templates requires Git and the 'git' command. ",
+        f"To clone a project without Git, copy the files from the '{name}' "
+        f"directory in the {repo} to {dest} manually.",
+    )
+    get_git_version(error=git_err)
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():


@@ -671,6 +671,9 @@ class Errors:
    E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
    E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
             "that you are providing a list of patterns as `List[List[dict]]`.")
+    E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
+             "through token.morph_ instead or add the string to the "
+             "StringStore with `nlp.vocab.strings.add(string)`.")


@add_codes


@@ -244,7 +244,8 @@ class Language:
        self._config["nlp"]["disabled"] = list(self.disabled)
        self._config["components"] = pipeline
        if not self._config["training"].get("score_weights"):
-            self._config["training"]["score_weights"] = combine_score_weights(score_weights)
+            combined_score_weights = combine_score_weights(score_weights)
+            self._config["training"]["score_weights"] = combined_score_weights
        if not srsly.is_json_serializable(self._config):
            raise ValueError(Errors.E961.format(config=self._config))
        return self._config
@@ -1166,14 +1167,20 @@ class Language:
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="Language", obj=type(get_examples))
            raise ValueError(err)
+        valid_examples = False
        for example in get_examples():
            if not isinstance(example, Example):
                err = Errors.E978.format(
                    name="Language.begin_training", types=type(example)
                )
                raise ValueError(err)
+            else:
+                valid_examples = True
            for word in [t.text for t in example.reference]:
                _ = self.vocab[word]  # noqa: F841
+        if not valid_examples:
+            err = Errors.E930.format(name="Language", obj="empty list")
+            raise ValueError(err)
        if device >= 0:  # TODO: do we need this here?
            require_gpu(device)
        if self.vocab.vectors.data.shape[1] >= 1:
@@ -1274,7 +1281,7 @@ class Language:
            util.logger.debug(doc)
            eg.predicted = doc
        results = scorer.score(examples)
-        n_words = sum(len(eg.predicted) for eg in examples)
+        n_words = sum(len(doc) for doc in docs)
        results["speed"] = n_words / (end_time - start_time)
        return results


@@ -56,7 +56,7 @@ subword_features = true
@Language.factory(
    "textcat",
    assigns=["doc.cats"],
-    default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL},
+    default_config={"labels": [], "threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
    scores=[
        "cats_score",
        "cats_score_desc",
@@ -75,6 +75,7 @@ def make_textcat(
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    labels: Iterable[str],
+    threshold: float,
) -> "TextCategorizer":
    """Create a TextCategorizer compoment. The text categorizer predicts categories
    over a whole document. It can learn one or more labels, and the labels can
@@ -86,8 +87,9 @@ def make_textcat(
        scores for each category.
    labels (list): A list of categories to learn. If empty, the model infers the
        categories from the data.
+    threshold (float): Cutoff to consider a prediction "positive".
    """
-    return TextCategorizer(nlp.vocab, model, name, labels=labels)
+    return TextCategorizer(nlp.vocab, model, name, labels=labels, threshold=threshold)


class TextCategorizer(Pipe):
@@ -103,6 +105,7 @@ class TextCategorizer(Pipe):
        name: str = "textcat",
        *,
        labels: Iterable[str],
+        threshold: float,
    ) -> None:
        """Initialize a text categorizer.
@@ -111,6 +114,7 @@ class TextCategorizer(Pipe):
        name (str): The component instance name, used to add entries to the
            losses during training.
        labels (Iterable[str]): The labels to use.
+        threshold (float): Cutoff to consider a prediction "positive".

        DOCS: https://nightly.spacy.io/api/textcategorizer#init
        """
@@ -118,7 +122,7 @@ class TextCategorizer(Pipe):
        self.model = model
        self.name = name
        self._rehearsal_model = None
-        cfg = {"labels": labels}
+        cfg = {"labels": labels, "threshold": threshold}
        self.cfg = dict(cfg)

    @property
@@ -371,5 +375,6 @@ class TextCategorizer(Pipe):
            labels=self.labels,
            multi_label=self.model.attrs["multi_label"],
            positive_label=positive_label,
+            threshold=self.cfg["threshold"],
            **kwargs,
        )
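A short usage sketch of the new `threshold` setting on the factory; the values are illustrative, not recommendations:

```python
import spacy

nlp = spacy.blank("en")
# "threshold" is now part of the component's default config and can be overridden.
textcat = nlp.add_pipe("textcat", config={"labels": [], "threshold": 0.5})
print(textcat.cfg["threshold"])  # 0.5
```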


@@ -246,15 +246,14 @@ class ConfigSchemaPretrainEmpty(BaseModel):

class ConfigSchemaPretrain(BaseModel):
    # fmt: off
    max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
-    min_length: StrictInt = Field(..., title="Minimum length of examples")
-    max_length: StrictInt = Field(..., title="Maximum length of examples")
    dropout: StrictFloat = Field(..., title="Dropout rate")
    n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
-    batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule")
-    seed: Optional[StrictInt] = Field(..., title="Random seed")
-    use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
-    tok2vec_model: StrictStr = Field(..., title="tok2vec model in config, e.g. components.tok2vec.model")
    optimizer: Optimizer = Field(..., title="The optimizer to use")
+    corpus: Reader = Field(..., title="Reader for the training data")
+    batcher: Batcher = Field(..., title="Batcher for the training data")
+    component: str = Field(..., title="Component to find the layer to pretrain")
+    layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
    # TODO: use a more detailed schema for this?
    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
    # fmt: on
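To make the schema change concrete, a sketch of a `[pretraining]` block shaped like the fields above, parsed with Thinc's `Config`. The values, and the omission of the required `optimizer`, `corpus` and `batcher` sub-blocks, are illustrative assumptions rather than shipped defaults:

```python
from thinc.api import Config

cfg_str = """
[pretraining]
max_epochs = 1000
dropout = 0.2
n_save_every = null
component = "tok2vec"
layer = ""

[pretraining.objective]
type = "characters"
n_characters = 4
"""

config = Config().from_str(cfg_str)
print(config["pretraining"]["component"])  # "tok2vec"
print(config["pretraining"]["objective"])  # {'type': 'characters', 'n_characters': 4}
```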


@@ -9,7 +9,10 @@ from spacy.pipeline.ner import DEFAULT_NER_MODEL

def _ner_example(ner):
-    doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+    doc = Doc(
+        ner.vocab,
+        words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
+    )
    gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
    return Example.from_dict(doc, gold)


@@ -66,3 +66,31 @@ def test_morph_set(i_has):
def test_morph_str(i_has):
    assert str(i_has[0].morph) == "PronType=prs"
    assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
+
+
+def test_morph_property(tokenizer):
+    doc = tokenizer("a dog")
+    # set through token.morph_
+    doc[0].morph_ = "PronType=prs"
+    assert doc[0].morph_ == "PronType=prs"
+    assert doc.to_array(["MORPH"])[0] != 0
+    # unset with token.morph
+    doc[0].morph = 0
+    assert doc.to_array(["MORPH"])[0] == 0
+    # empty morph is equivalent to "_"
+    doc[0].morph_ = ""
+    assert doc[0].morph_ == ""
+    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+    # "_" morph is also equivalent to empty morph
+    doc[0].morph_ = "_"
+    assert doc[0].morph_ == ""
+    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+    # set through existing hash with token.morph
+    tokenizer.vocab.strings.add("Feat=Val")
+    doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
+    assert doc[0].morph_ == "Feat=Val"


@@ -78,7 +78,7 @@ def patterns(en_vocab):
            "REL_OP": ">",
            "RIGHT_ID": "fox",
            "RIGHT_ATTRS": {"ORTH": "fox"},
-        }
+        },
    ]

    pattern5 = [
@@ -233,9 +233,7 @@ def test_dependency_matcher_callback(en_vocab, doc):
    assert matches == matches2


-@pytest.mark.parametrize(
-    "op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),]
-)
+@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
    # two sentences to test that all matches are within the same sentence
    doc = get_doc(
@@ -248,7 +246,7 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
    for text in ["a", "b", "c", "d", "e"]:
        pattern = [
            {"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
-            {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},},
+            {"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {}},
        ]
        matcher = DependencyMatcher(en_vocab)
        matcher.add("A", [pattern])


@@ -54,7 +54,10 @@ def _parser_example(parser):

def _ner_example(ner):
-    doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+    doc = Doc(
+        ner.vocab,
+        words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"],
+    )
    gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
    return Example.from_dict(doc, gold)


@@ -30,9 +30,10 @@ TRAIN_DATA = [
    ),
]


def test_begin_training_examples():
    nlp = Language()
-    senter = nlp.add_pipe("senter")
+    nlp.add_pipe("senter")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))


@@ -89,7 +89,7 @@ def test_no_label():

def test_implicit_label():
    nlp = Language()
-    textcat = nlp.add_pipe("textcat")
+    nlp.add_pipe("textcat")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))


@@ -136,7 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    cfg = {"model": DEFAULT_TEXTCAT_MODEL}
    model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"])
+    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5)
    textcat.to_bytes(exclude=["vocab"])


@@ -5,7 +5,6 @@ from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
-from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
@@ -231,48 +230,6 @@ def test_cli_converters_conll_ner2json():
        assert ent.text in ["New York City", "London"]


-def test_pretrain_make_docs():
-    nlp = English()
-    valid_jsonl_text = {"text": "Some text"}
-    docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
-    assert len(docs) == 1
-    assert skip_count == 0
-    valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
-    docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
-    assert len(docs) == 1
-    assert skip_count == 0
-    invalid_jsonl_type = 0
-    with pytest.raises(TypeError):
-        make_docs(nlp, [invalid_jsonl_type], 1, 100)
-    invalid_jsonl_key = {"invalid": "Does not matter"}
-    with pytest.raises(ValueError):
-        make_docs(nlp, [invalid_jsonl_key], 1, 100)
-    empty_jsonl_text = {"text": ""}
-    docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
-    assert len(docs) == 0
-    assert skip_count == 1
-    empty_jsonl_tokens = {"tokens": []}
-    docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
-    assert len(docs) == 0
-    assert skip_count == 1
-    too_short_jsonl = {"text": "This text is not long enough"}
-    docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
-    assert len(docs) == 0
-    assert skip_count == 0
-    too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
-    docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
-    assert len(docs) == 0
-    assert skip_count == 0


def test_project_config_validation_full():
    config = {
        "vars": {"some_var": 20},


@@ -155,3 +155,11 @@ def test_tokenizer_special_cases_with_period(tokenizer):
    tokenizer.add_special_case("_SPECIAL_", [{"orth": "_SPECIAL_"}])
    doc = tokenizer(text)
    assert [token.text for token in doc] == ["_SPECIAL_", "."]
+
+
+def test_tokenizer_special_cases_idx(tokenizer):
+    text = "the _ID'X_"
+    tokenizer.add_special_case("_ID'X_", [{"orth": "_ID"}, {"orth": "'X_"}])
+    doc = tokenizer(text)
+    assert doc[1].idx == 4
+    assert doc[2].idx == 7


@@ -343,8 +343,9 @@ cdef class Tokenizer:
                for j in range(cached.length):
                    tokens[i + offset + j] = cached.data.tokens[j]
                    tokens[i + offset + j].idx = orig_idx + idx_offset
-                    idx_offset += cached.data.tokens[j].lex.length + \
-                        1 if cached.data.tokens[j].spacy else 0
+                    idx_offset += cached.data.tokens[j].lex.length
+                    if cached.data.tokens[j].spacy:
+                        idx_offset += 1
                tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
                i += span_end - span_start
                offset += span[3]


@@ -214,9 +214,17 @@ cdef class Token:
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))

-    @property
-    def morph(self):
-        return MorphAnalysis.from_id(self.vocab, self.c.morph)
+    property morph:
+        def __get__(self):
+            return MorphAnalysis.from_id(self.vocab, self.c.morph)
+
+        def __set__(self, attr_t morph):
+            if morph == 0:
+                self.c.morph = morph
+            elif morph in self.vocab.strings:
+                self.morph_ = self.vocab.strings[morph]
+            else:
+                raise ValueError(Errors.E1009.format(val=morph))

    property morph_:
        def __get__(self):
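Informally, a usage sketch of the new setter, mirroring the test added elsewhere in this commit; `spacy.blank("en")` is used here only for illustration:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("a dog")

# Set the analysis from a string via token.morph_ ...
doc[0].morph_ = "PronType=prs"
print(doc[0].morph_)  # "PronType=prs"

# ... or from a known hash via token.morph; unknown hashes raise E1009.
feat_hash = nlp.vocab.strings.add("Feat=Val")
doc[0].morph = feat_hash
print(doc[0].morph_)  # "Feat=Val"

# Setting 0 clears the analysis.
doc[0].morph = 0
```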


@@ -1,6 +1,7 @@
import warnings
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from pathlib import Path
+import srsly

from .. import util
from .example import Example
@@ -21,6 +22,36 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
    return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)


+@util.registry.readers("spacy.JsonlReader.v1")
+def create_jsonl_reader(
+    path: Path, min_length: int=0, max_length: int = 0, limit: int = 0
+) -> Callable[["Language"], Iterable[Doc]]:
+    return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
+
+
+def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
+    path = util.ensure_path(path)
+    if not path.is_dir() and path.parts[-1].endswith(file_type):
+        return [path]
+    orig_path = path
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts and path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        elif path.parts[-1].endswith(file_type):
+            locs.append(path)
+    if len(locs) == 0:
+        warnings.warn(Warnings.W090.format(path=orig_path))
+    return locs
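A small sketch of what `walk_corpus` does, assuming it remains a module-level helper in the training corpus module; the directory name is hypothetical:

```python
from spacy.training.corpus import walk_corpus  # import path assumed

# Recursively collect all .spacy files under "corpus/", skipping hidden entries,
# and warn if nothing is found.
paths = walk_corpus("corpus/", ".spacy")
print([str(p) for p in paths])
```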
class Corpus:
    """Iterate Example objects from a file or directory of DocBin (.spacy)
@@ -47,36 +78,13 @@ class Corpus:
        *,
        limit: int = 0,
        gold_preproc: bool = False,
-        max_length: bool = False,
+        max_length: int = 0,
    ) -> None:
        self.path = util.ensure_path(path)
        self.gold_preproc = gold_preproc
        self.max_length = max_length
        self.limit = limit

-    @staticmethod
-    def walk_corpus(path: Union[str, Path]) -> List[Path]:
-        path = util.ensure_path(path)
-        if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE):
-            return [path]
-        orig_path = path
-        paths = [path]
-        locs = []
-        seen = set()
-        for path in paths:
-            if str(path) in seen:
-                continue
-            seen.add(str(path))
-            if path.parts and path.parts[-1].startswith("."):
-                continue
-            elif path.is_dir():
-                paths.extend(path.iterdir())
-            elif path.parts[-1].endswith(FILE_TYPE):
-                locs.append(path)
-        if len(locs) == 0:
-            warnings.warn(Warnings.W090.format(path=orig_path))
-        return locs

    def __call__(self, nlp: "Language") -> Iterator[Example]:
        """Yield examples from the data.
@@ -85,11 +93,11 @@ class Corpus:

        DOCS: https://nightly.spacy.io/api/corpus#call
        """
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path))
+        ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
        if self.gold_preproc:
            examples = self.make_examples_gold_preproc(nlp, ref_docs)
        else:
-            examples = self.make_examples(nlp, ref_docs, self.max_length)
+            examples = self.make_examples(nlp, ref_docs)
        yield from examples

    def _make_example(
@@ -108,18 +116,18 @@ class Corpus:
        return Example(nlp.make_doc(reference.text), reference)

    def make_examples(
-        self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
+        self, nlp: "Language", reference_docs: Iterable[Doc]
    ) -> Iterator[Example]:
        for reference in reference_docs:
            if len(reference) == 0:
                continue
-            elif max_length == 0 or len(reference) < max_length:
+            elif self.max_length == 0 or len(reference) < self.max_length:
                yield self._make_example(nlp, reference, False)
            elif reference.is_sentenced:
                for ref_sent in reference.sents:
                    if len(ref_sent) == 0:
                        continue
-                    elif max_length == 0 or len(ref_sent) < max_length:
+                    elif self.max_length == 0 or len(ref_sent) < self.max_length:
                        yield self._make_example(nlp, ref_sent.as_doc(), False)

    def make_examples_gold_preproc(
@@ -151,3 +159,57 @@ class Corpus:
            i += 1
            if self.limit >= 1 and i >= self.limit:
                break
+
+
+class JsonlTexts:
+    """Iterate Doc objects from a file or directory of jsonl
+    formatted raw text files.
+
+    path (Path): The directory or filename to read from.
+    min_length (int): Minimum document length (in tokens). Shorter documents
+        will be skipped. Defaults to 0, which indicates no limit.
+    max_length (int): Maximum document length (in tokens). Longer documents will
+        be skipped. Defaults to 0, which indicates no limit.
+    limit (int): Limit corpus to a subset of examples, e.g. for debugging.
+        Defaults to 0, which indicates no limit.
+
+    DOCS: https://nightly.spacy.io/api/corpus
+    """
+
+    file_type = "jsonl"
+
+    def __init__(
+        self,
+        path: Union[str, Path],
+        *,
+        limit: int = 0,
+        min_length: int = 0,
+        max_length: int = 0,
+    ) -> None:
+        self.path = util.ensure_path(path)
+        self.min_length = min_length
+        self.max_length = max_length
+        self.limit = limit
+
+    def __call__(self, nlp: "Language") -> Iterator[Example]:
+        """Yield examples from the data.
+
+        nlp (Language): The current nlp object.
+        YIELDS (Doc): The docs.
+
+        DOCS: https://nightly.spacy.io/api/corpus#call
+        """
+        for loc in walk_corpus(self.path, "jsonl"):
+            records = srsly.read_jsonl(loc)
+            for record in records:
+                doc = nlp.make_doc(record["text"])
+                if self.min_length >= 1 and len(doc) < self.min_length:
+                    continue
+                elif self.max_length >= 1 and len(doc) >= self.max_length:
+                    continue
+                else:
+                    words = [w.text for w in doc]
+                    spaces = [bool(w.whitespace_) for w in doc]
+                    # We don't *need* an example here, but it seems nice to
+                    # make it match the Corpus signature.
+                    yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
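A hedged sketch of how the registered JSONL reader might be used directly. The registry lookup mirrors the decorator above; the file path and the assumption that each line holds a `{"text": ...}` record are illustrative:

```python
from spacy import util
from spacy.lang.en import English

nlp = English()
make_reader = util.registry.readers.get("spacy.JsonlReader.v1")
corpus = make_reader("texts.jsonl", min_length=1, max_length=500)  # hypothetical file
for example in corpus(nlp):
    print(example.reference.text)
```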


@@ -36,20 +36,12 @@ def console_logger():
                    keys=list(info["losses"].keys()),
                )
            ) from None
-        try:
-            scores = [
-                "{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100)
-                for col in score_cols
-            ]
-        except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="scores (other)",
-                    key=str(e),
-                    keys=list(info["other_scores"].keys()),
-                )
-            ) from None
+        scores = []
+        for col in score_cols:
+            score = float(info["other_scores"].get(col, 0.0))
+            if col != "speed":
+                score *= 100
+            scores.append("{0:.2f}".format(score))
        data = (
            [info["epoch"], info["step"]]
            + losses


@@ -648,12 +648,20 @@ def join_command(command: List[str]) -> str:
    return " ".join(shlex.quote(cmd) for cmd in command)


-def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
+def run_command(
+    command: Union[str, List[str]],
+    *,
+    capture: bool = False,
+    stdin: Optional[Any] = None,
+) -> Optional[subprocess.CompletedProcess]:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.

    command (str / List[str]): The command. If provided as a string, the
        string will be split using shlex.split.
+    stdin (Optional[Any]): stdin to read from or None.
+    capture (bool): Whether to capture the output.
+    RETURNS (Optional[CompletedProcess]): The process object.
    """
    if isinstance(command, str):
        command = split_command(command)
@@ -671,6 +679,10 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
        raise FileNotFoundError(
            Errors.E970.format(str_command=" ".join(command), tool=command[0])
        ) from None
+    except subprocess.CalledProcessError as e:
+        # We don't want a duplicate traceback here
+        print(e)
+        sys.exit(1)
    if ret.returncode != 0:
        sys.exit(ret.returncode)
    return ret
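A small usage sketch of `run_command` with output capturing; the command itself is just an example:

```python
from spacy.util import run_command

# With capture=True the CompletedProcess is returned and stdout is available as text.
# A missing executable or a non-zero exit code results in a clean system exit.
ret = run_command(["git", "--version"], capture=True)
if ret is not None:
    print(ret.stdout.strip())
```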


@@ -14,6 +14,7 @@ menu:
  - ['evaluate', 'evaluate']
  - ['package', 'package']
  - ['project', 'project']
+  - ['ray', 'ray']
---

spaCy's CLI provides a range of helpful commands for downloading and training
@@ -1134,3 +1135,47 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
## ray {#ray new="3"}
The `spacy ray` CLI includes commands for parallel and distributed computing via
[Ray](https://ray.io).
<Infobox variant="warning">
To use this command, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
### ray train {#ray-train tag="command"}
Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
command works just like [`spacy train`](/api/cli#train). For more details and
examples, see the usage guide on
[parallel training](/usage/training#parallel-training) and the spaCy project
[integration](/usage/projects#ray).
```cli
$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
```
> #### Example
>
> ```cli
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
| Name | Description |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |


@@ -30,15 +30,17 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
> config = {
>    "labels": [],
+>    "threshold": 0.5,
>    "model": DEFAULT_TEXTCAT_MODEL,
> }
> nlp.add_pipe("textcat", config=config)
> ```

| Setting     | Description |
-| -------- | ----------- |
+| ----------- | ----------- |
| `labels`    | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/textcat.py
@@ -58,7 +60,7 @@ architectures and their arguments and hyperparameters.
>
> # Construction from class
> from spacy.pipeline import TextCategorizer
-> textcat = TextCategorizer(nlp.vocab, model)
+> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5)
> ```

Create a new pipeline instance. In your application, you would normally use a
@@ -72,6 +74,7 @@ shortcut for this and instantiate the component using its string name and
| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `labels`       | The labels to use. ~~Iterable[str]~~ |
+| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |

## TextCategorizer.\_\_call\_\_ {#call tag="method"}

File diff suppressed because one or more lines are too long

Binary image added (67 KiB).


@@ -26,7 +26,7 @@ on training Stanza on this corpus to allow direct comparison.

<figure>

-| System               | POS  | USA  | LAS  |
+| System               | POS  | UAS  | LAS  |
| -------------------- | ---: | ---: | ---: |
| spaCy RoBERTa (2020) |      |      |      |
| spaCy CNN (2020)     |      |      |      |


@@ -61,17 +61,13 @@ import Benchmarks from 'usage/\_benchmarks-models.md'

<Benchmarks />

-<!-- TODO:
-<Project id="benchmarks/penn_treebank">
+<Project id="benchmarks/parsing_penn_treebank">

The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
our project template.

</Project>
--->

<!-- ## Citing spaCy {#citation}

<!-- TODO: update -->


@@ -796,11 +796,9 @@ workflows, including
evaluation workflow that lets you compare two different models and their
results.

-<Project id="integrations/prodigy">
-<!-- TODO: -->
-</Project>
+<!-- TODO: <Project id="integrations/prodigy">
+</Project> -->

---
@@ -817,7 +815,7 @@ full embedded visualizer, as well as individual components.
> #### Installation
>
> ```bash
-> $ pip install "spacy_streamlit>=1.0.0a0"
+> $ pip install "spacy-streamlit>=1.0.0a0"
> ```

![](../images/spacy-streamlit.png)
@@ -915,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>

-<!-- TODO: document -->
+> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray for parallel and distributed
training with spaCy via our lightweight
[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
package is installed in the same environment as spaCy, it will automatically add
[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
`project.yml` just like the regular training command:
<!-- prettier-ignore -->
```yaml
### project.yml
- name: "ray"
help: "Train a model via parallel training with Ray"
script:
- "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
deps:
- "corpus/train.spacy"
- "corpus/dev.spacy"
```
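Once defined, the command runs like any other project command, for example via `python -m spacy project run ray` from the project directory (using the command name `"ray"` declared in the snippet above).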
<!-- TODO: <Project id="integrations/ray">
</Project> -->
--- ---
@ -943,12 +973,14 @@ your results.
![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values') ![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values')
<!-- TODO:
<Project id="integrations/wandb"> <Project id="integrations/wandb">
Get started with tracking your spaCy training runs in Weights & Biases using our Get started with tracking your spaCy training runs in Weights & Biases using our
project template. It includes a simple config using the `WandbLogger`, as well project template. It includes a simple config using the `WandbLogger`, as well
as a custom logger implementation you can adjust for your specific use case. as a custom logger implementation you can adjust for your specific use case.
<!-- TODO: -->
</Project> </Project>
-->
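Until the template is available, a rough sketch of enabling the logger: assuming it is registered as `spacy.WandbLogger.v1` and takes a `project_name` argument, you would point the `[training.logger]` block of your training config at `@loggers = "spacy.WandbLogger.v1"` and set `project_name` to the name of your Weights & Biases project.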

View File

@ -1075,7 +1075,7 @@ relations and tokens we want to match:
> #### Visualizing the parse > #### Visualizing the parse
> >
> The [`displacy` visualizer](/usage/visualizer) lets you render `Doc` objects > The [`displacy` visualizer](/usage/visualizers) lets you render `Doc` objects
> and their dependency parse and part-of-speech tags: > and their dependency parse and part-of-speech tags:
> >
> ```python > ```python

View File

@ -7,7 +7,7 @@ menu:
- ['Quickstart', 'quickstart'] - ['Quickstart', 'quickstart']
- ['Config System', 'config'] - ['Config System', 'config']
- ['Custom Functions', 'custom-functions'] - ['Custom Functions', 'custom-functions']
# - ['Parallel Training', 'parallel-training'] - ['Parallel Training', 'parallel-training']
- ['Internal API', 'api'] - ['Internal API', 'api']
--- ---
@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width) return create_model(output_width)
``` ```
## Parallel & distributed training with Ray {#parallel-training}
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster, though: it depends on your batch size, models,
and hardware.
<Infobox variant="warning">
To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.
```cli
python -m spacy ray train config.cfg --n-workers 2
```
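To attach to an existing Ray cluster instead of running locally, you would additionally pass its address, e.g. `python -m spacy ray train config.cfg --n-workers 2 --address <cluster address>`, replacing the placeholder with your cluster's address.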
<!-- TODO: <Project id="integrations/ray">
</Project> -->
### How parallel training works {#parallel-training-details}
Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
![Illustration of setup](../images/spacy-ray.svg)
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the
gradient. Workers use the gradients they receive and the ones they compute
locally to update the parameters they own, and then broadcast the updated array
and a new version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates; they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.
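To make the ownership and versioning scheme concrete, here is a deliberately simplified, single-process sketch. It is not the `spacy-ray` implementation: the `Worker` class, the plain-SGD update and the random stand-in gradients are illustrative assumptions, and real workers would exchange messages over Ray rather than call each other directly.

```python
# Single-process sketch of parameter ownership, gradient routing and
# version IDs. All names and the update rule are illustrative assumptions.
import numpy as np

class Worker:
    def __init__(self, worker_id, owned, params, lr=0.1):
        self.id = worker_id
        self.owned = set(owned)                                  # parameters this worker owns
        self.params = {k: v.copy() for k, v in params.items()}   # local copy of every parameter
        self.versions = {k: 0 for k in params}                   # version ID per parameter
        self.peers = []                                          # other workers, wired up below
        self.lr = lr

    def handle_gradient(self, name, grad, version):
        # Called on the owner. Discard gradients computed against a stale version.
        if version != self.versions[name]:
            return
        self.params[name] -= self.lr * grad
        self.versions[name] += 1
        # Broadcast the updated array and its new version to the other workers.
        for peer in self.peers:
            peer.receive_update(name, self.params[name], self.versions[name])

    def receive_update(self, name, value, version):
        self.params[name] = value.copy()
        self.versions[name] = version

    def train_step(self):
        # Pretend this batch produced a gradient for every parameter.
        for name, value in self.params.items():
            grad = 0.01 * np.random.randn(*value.shape)          # stand-in gradient
            if name in self.owned:
                self.handle_gradient(name, grad, self.versions[name])
            else:
                owner = next(p for p in self.peers if name in p.owned)
                owner.handle_gradient(name, grad, self.versions[name])

params = {"embed": np.zeros((4, 4)), "output": np.zeros((4, 2))}
w0 = Worker(0, ["embed"], params)
w1 = Worker(1, ["output"], params)
w0.peers, w1.peers = [w1], [w0]
for _ in range(3):
    w0.train_step()
    w1.train_step()
print(w0.versions, w1.versions)  # versions advance as owners apply updates
```

In the real setup, the gradient messages and broadcasts would be asynchronous Ray calls, which is what lets the transfers overlap with training instead of blocking it.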
## Internal training API {#api} ## Internal training API {#api}
<Infobox variant="warning"> <Infobox variant="warning">

View File

@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
- [Training & config system](#features-training) - [Training & config system](#features-training)
- [Custom models](#features-custom-models) - [Custom models](#features-custom-models)
- [End-to-end project workflows](#features-projects) - [End-to-end project workflows](#features-projects)
- [Parallel training with Ray](#features-parallel-training)
- [New built-in components](#features-pipeline-components) - [New built-in components](#features-pipeline-components)
- [New custom component API](#features-components) - [New custom component API](#features-components)
- [Dependency matching](#features-dep-matcher) - [Dependency matching](#features-dep-matcher)
@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
</Infobox> </Infobox>
### Parallel and distributed training with Ray {#features-parallel-training}
> #### Example
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> # Train a pipeline
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. The Ray
integration is powered by a lightweight extension package,
[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
parallel training.
![Illustration of setup](../images/spacy-ray.svg)
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage: **
[Parallel and distributed training](/usage/training#parallel-training),
[spaCy Projects integration](/usage/projects#ray)
- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
</Infobox>
### New built-in pipeline components {#features-pipeline-components} ### New built-in pipeline components {#features-pipeline-components}
spaCy v3.0 includes several new trainable and rule-based components that you can spaCy v3.0 includes several new trainable and rule-based components that you can
@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. | | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
### New and updated documentation {#new-docs} ### New and updated documentation {#new-docs}

View File

@ -26,11 +26,27 @@ const replacements = {
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
} }
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
module.exports = { module.exports = {
siteMetadata: { siteMetadata: {
...site, ...site,
sidebars, sidebars,
...models, ...models,
counts: getCounts(models.languages),
universe, universe,
nightly: isNightly, nightly: isNightly,
binderBranch, binderBranch,

View File

@ -1,5 +1,16 @@
{ {
"resources": [ "resources": [
{
"id": "spacy-ray",
"title": "spacy-ray",
"slogan": "Parallel and distributed training with spaCy and Ray",
"description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
"github": "explosion/spacy-ray",
"pip": "spacy-ray",
"category": ["training"],
"author": "Explosion / Anyscale",
"thumb": "https://i.imgur.com/7so6ZpS.png"
},
{ {
"id": "spacy-sentence-bert", "id": "spacy-sentence-bert",
"title": "spaCy - sentence-transformers", "title": "spaCy - sentence-transformers",
@ -2518,14 +2529,14 @@
"description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
"pip": "cov-bsv", "pip": "cov-bsv",
"code_example": [ "code_example": [
"import cov_bsv", "import cov_bsv",
"", "",
"nlp = cov_bsv.load()", "nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'", "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"", "",
"print(doc.ents)", "print(doc.ents)",
"print(doc._.cov_classification)", "print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)" "cov_bsv.visualize_doc(doc)"
], ],
"category": ["pipeline", "standalone", "biomedical", "scientific"], "category": ["pipeline", "standalone", "biomedical", "scientific"],
"tags": ["clinical", "epidemiology", "covid-19", "surveillance"], "tags": ["clinical", "epidemiology", "covid-19", "surveillance"],

View File

@ -14,6 +14,7 @@ import GitHubCode from './github'
import classes from '../styles/code.module.sass' import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 30 const WRAP_THRESHOLD = 30
const CLI_GROUPS = ['init', 'debug', 'project', 'ray']
export default props => ( export default props => (
<Pre> <Pre>
@ -99,7 +100,6 @@ function replacePrompt(line, prompt, isFirst = false) {
} }
function parseArgs(raw) { function parseArgs(raw) {
const commandGroups = ['init', 'debug', 'project']
let args = raw.split(' ').filter(arg => arg) let args = raw.split(' ').filter(arg => arg)
const result = {} const result = {}
while (args.length) { while (args.length) {
@ -108,7 +108,12 @@ function parseArgs(raw) {
const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-')) const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
result[opt] = isFlag ? true : args.shift() result[opt] = isFlag ? true : args.shift()
} else { } else {
const key = commandGroups.includes(opt) ? `${opt} ${args.shift()}` : opt let key = opt
if (CLI_GROUPS.includes(opt)) {
if (args.length && !args[0].startsWith('-')) {
key = `${opt} ${args.shift()}`
}
}
result[key] = null result[key] = null
} }
} }

View File

@ -38,8 +38,8 @@ export const LandingSubtitle = ({ children }) => (
) )
export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => ( export const LandingGrid = ({ cols = 3, blocks = false, style, children }) => (
<Content className={classNames(classes.grid, { [classes.blocks]: blocks })}> <Content className={classNames({ [classes.blocks]: blocks })}>
<Grid cols={cols} narrow={blocks} style={style}> <Grid cols={cols} narrow={blocks} className={classes.grid} style={style}>
{children} {children}
</Grid> </Grid>
</Content> </Content>

View File

@ -26,8 +26,11 @@
border-bottom-right-radius: 0 border-bottom-right-radius: 0
.icon .icon
width: 2rem $width: 2rem
height: 2rem
width: $width
height: $width
flex: 0 0 $width
background: var(--color-theme) background: var(--color-theme)
color: var(--color-back) color: var(--color-back)
border-radius: 50% border-radius: 50%

View File

@ -128,14 +128,17 @@
padding-right: 2rem padding-right: 2rem
@include breakpoint(max, md) @include breakpoint(max, md)
.banner
padding: 1rem 3rem
.banner-content .banner-content
display: block display: block
.banner-text .banner-text
padding-top: 0 padding-top: 0
.col .grid
grid-column: 1 / span 2 grid-template-columns: 1fr !important
.banner-button .banner-button
margin-bottom: var(--spacing-sm) margin-bottom: var(--spacing-sm)

View File

@ -54,23 +54,8 @@ for entity in doc.ents:
print(entity.text, entity.label_) print(entity.text, entity.label_)
` `
/**
* Compute the overall total counts of models and languages
*/
function getCounts(langs = []) {
return {
langs: langs.length,
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
starters: langs
.map(({ starters }) => (starters ? starters.length : 0))
.reduce((a, b) => a + b, 0),
}
}
const Landing = ({ data }) => { const Landing = ({ data }) => {
const counts = getCounts(data.languages) const { counts } = data
return ( return (
<> <>
<LandingHeader nightly={data.nightly}> <LandingHeader nightly={data.nightly}>
@ -345,7 +330,10 @@ const landingQuery = graphql`
siteMetadata { siteMetadata {
nightly nightly
repo repo
languages { counts {
langs
modelLangs
starterLangs
models models
starters starters
} }