Mirror of https://github.com/explosion/spaCy.git

Commit e22de2e69d: Merge branch 'develop' into nightly.spacy.io
				|  | @ -6,7 +6,7 @@ requires = [ | |||
|     "cymem>=2.0.2,<2.1.0", | ||||
|     "preshed>=3.0.2,<3.1.0", | ||||
|     "murmurhash>=0.28.0,<1.1.0", | ||||
|     "thinc>=8.0.0a33,<8.0.0a40", | ||||
|     "thinc>=8.0.0a34,<8.0.0a40", | ||||
|     "blis>=0.4.0,<0.5.0", | ||||
|     "pytokenizations", | ||||
|     "pathy" | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| # Our libraries | ||||
| cymem>=2.0.2,<2.1.0 | ||||
| preshed>=3.0.2,<3.1.0 | ||||
| thinc>=8.0.0a33,<8.0.0a40 | ||||
| thinc>=8.0.0a34,<8.0.0a40 | ||||
| blis>=0.4.0,<0.5.0 | ||||
| ml_datasets==0.2.0a0 | ||||
| murmurhash>=0.28.0,<1.1.0 | ||||
|  |  | |||
|  | @ -34,13 +34,13 @@ setup_requires = | |||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|     murmurhash>=0.28.0,<1.1.0 | ||||
|     thinc>=8.0.0a33,<8.0.0a40 | ||||
|     thinc>=8.0.0a34,<8.0.0a40 | ||||
| install_requires = | ||||
|     # Our libraries | ||||
|     murmurhash>=0.28.0,<1.1.0 | ||||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|     thinc>=8.0.0a33,<8.0.0a40 | ||||
|     thinc>=8.0.0a34,<8.0.0a40 | ||||
|     blis>=0.4.0,<0.5.0 | ||||
|     wasabi>=0.8.0,<1.1.0 | ||||
|     srsly>=2.1.0,<3.0.0 | ||||
|  |  | |||
|  | @ -308,6 +308,31 @@ def git_checkout( | |||
|         msg.fail("Destination of checkout must not exist", exits=1) | ||||
|     if not dest.parent.exists(): | ||||
|         raise IOError("Parent of destination of checkout must exist") | ||||
| 
 | ||||
|     if sparse and git_version >= (2, 22): | ||||
|         return git_sparse_checkout(repo, subpath, dest, branch) | ||||
|     elif sparse: | ||||
|         # Only show warnings if the user explicitly wants sparse checkout but | ||||
|         # the Git version doesn't support it | ||||
|         err_old = ( | ||||
|             f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " | ||||
|             f"that doesn't fully support sparse checkout yet." | ||||
|         ) | ||||
|         err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." | ||||
|         msg.warn( | ||||
|             f"{err_unk if git_version == (0, 0) else err_old} " | ||||
|             f"This means that more files than necessary may be downloaded " | ||||
|             f"temporarily. To only download the files needed, make sure " | ||||
|             f"you're using Git v2.22 or above." | ||||
|         ) | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" | ||||
|         ret = run_command(cmd, capture=True) | ||||
|         # We need Path(name) to make sure we also support subdirectories | ||||
|         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) | ||||
| 
 | ||||
| 
 | ||||
| def git_sparse_checkout(repo, subpath, dest, branch): | ||||
|     # We're using Git, partial clone and sparse checkout to | ||||
|     # only clone the files we need | ||||
|     # This ends up being RIDICULOUS. omg. | ||||
|  | @ -324,47 +349,31 @@ def git_checkout( | |||
|     # *that* we can do by path. | ||||
|     # We're using Git and sparse checkout to only clone the files we need | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         supports_sparse = git_version >= (2, 22) | ||||
|         use_sparse = supports_sparse and sparse | ||||
|         # This is the "clone, but don't download anything" part. | ||||
|         cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " | ||||
|         if use_sparse: | ||||
|             cmd += f"--filter=blob:none"  # <-- The key bit | ||||
|         # Only show warnings if the user explicitly wants sparse checkout but | ||||
|         # the Git version doesn't support it | ||||
|         elif sparse: | ||||
|             err_old = ( | ||||
|                 f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " | ||||
|                 f"that doesn't fully support sparse checkout yet." | ||||
|             ) | ||||
|             err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." | ||||
|             msg.warn( | ||||
|                 f"{err_unk if git_version == (0, 0) else err_old} " | ||||
|                 f"This means that more files than necessary may be downloaded " | ||||
|                 f"temporarily. To only download the files needed, make sure " | ||||
|                 f"you're using Git v2.22 or above." | ||||
|             ) | ||||
|         try_run_command(cmd) | ||||
|         cmd = ( | ||||
|             f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " | ||||
|             f"-b {branch} --filter=blob:none" | ||||
|         ) | ||||
|         run_command(cmd) | ||||
|         # Now we need to find the missing filenames for the subpath we want. | ||||
|         # Looking for this 'rev-list' command in the git --help? Hah. | ||||
|         cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" | ||||
|         ret = try_run_command(cmd) | ||||
|         cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" | ||||
|         ret = run_command(cmd, capture=True) | ||||
|         git_repo = _from_http_to_git(repo) | ||||
|         # Now pass those missings into another bit of git internals | ||||
|         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) | ||||
|         if use_sparse and not missings: | ||||
|         if not missings: | ||||
|             err = ( | ||||
|                 f"Could not find any relevant files for '{subpath}'. " | ||||
|                 f"Did you specify a correct and complete path within repo '{repo}' " | ||||
|                 f"and branch {branch}?" | ||||
|             ) | ||||
|             msg.fail(err, exits=1) | ||||
|         if use_sparse: | ||||
|             cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" | ||||
|             try_run_command(cmd) | ||||
|         cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" | ||||
|         run_command(cmd, capture=True) | ||||
|         # And finally, we can checkout our subpath | ||||
|         cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" | ||||
|         try_run_command(cmd) | ||||
|         run_command(cmd, capture=True) | ||||
|         # We need Path(name) to make sure we also support subdirectories | ||||
|         shutil.move(str(tmp_dir / Path(subpath)), str(dest)) | ||||
| 
 | ||||
|  | @ -378,7 +387,7 @@ def get_git_version( | |||
|     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns | ||||
|         (0, 0) if the version couldn't be determined. | ||||
|     """ | ||||
|     ret = try_run_command(["git", "--version"], error=error) | ||||
|     ret = run_command("git --version", capture=True) | ||||
|     stdout = ret.stdout.strip() | ||||
|     if not stdout or not stdout.startswith("git version"): | ||||
|         return (0, 0) | ||||
|  | @ -386,23 +395,6 @@ def get_git_version( | |||
|     return (int(version[0]), int(version[1])) | ||||
| 
 | ||||
| 
 | ||||
| def try_run_command( | ||||
|     cmd: Union[str, List[str]], error: str = "Could not run command" | ||||
| ) -> subprocess.CompletedProcess: | ||||
|     """Try running a command and raise an error if it fails. | ||||
| 
 | ||||
|     cmd (Union[str, List[str]]): The command to run. | ||||
|     error (str): The error message. | ||||
|     RETURNS (CompletedProcess): The completed process if the command ran. | ||||
|     """ | ||||
|     try: | ||||
|         return run_command(cmd, capture=True) | ||||
|     except subprocess.CalledProcessError as e: | ||||
|         msg.fail(error) | ||||
|         print(cmd) | ||||
|         sys.exit(1) | ||||
| 
 | ||||
| 
 | ||||
| def _from_http_to_git(repo: str) -> str: | ||||
|     if repo.startswith("http://"): | ||||
|         repo = repo.replace(r"http://", r"https://") | ||||
|  |  | |||
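The refactored checkout logic above gates sparse checkout on `get_git_version()`, which returns a `(major, minor)` tuple and falls back to `(0, 0)` when the version can't be determined. Below is a minimal standalone sketch of that version gate, using plain `subprocess` instead of spaCy's `run_command` helper; it only assumes that `git --version` prints a line like `git version 2.30.1`.

```python
import subprocess

def get_git_version() -> tuple:
    # Run `git --version` and parse the "git version X.Y.Z" line.
    try:
        ret = subprocess.run(["git", "--version"], capture_output=True, encoding="utf8")
    except FileNotFoundError:
        return (0, 0)  # git not installed
    stdout = (ret.stdout or "").strip()
    if not stdout.startswith("git version"):
        return (0, 0)  # unknown version: sparse checkout gets disabled
    major, minor = stdout.split()[2].split(".")[:2]
    return (int(major), int(minor))

# Sparse checkout (clone --filter=blob:none plus rev-list --missing=print) needs
# Git v2.22+; older or unknown versions fall back to a full shallow clone.
if get_git_version() >= (2, 22):
    print("sparse checkout supported")
else:
    print("falling back to a full shallow clone")
```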
|  | @ -2,7 +2,7 @@ from typing import Dict, Any, Optional | |||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam | ||||
| from thinc.api import Model, data_validation | ||||
| from thinc.api import Model, data_validation, set_gpu_allocator | ||||
| import typer | ||||
| 
 | ||||
| from ._util import Arg, Opt, debug_cli, show_validation_error | ||||
|  | @ -53,7 +53,12 @@ def debug_model_cli( | |||
|     } | ||||
|     config_overrides = parse_config_overrides(ctx.args) | ||||
|     with show_validation_error(config_path): | ||||
|         config = util.load_config(config_path, overrides=config_overrides) | ||||
|         config = util.load_config( | ||||
|             config_path, overrides=config_overrides, interpolate=True | ||||
|         ) | ||||
|         allocator = config["training"]["gpu_allocator"] | ||||
|         if use_gpu >= 0 and allocator: | ||||
|             set_gpu_allocator(allocator) | ||||
|         nlp, config = util.load_model_from_config(config_path) | ||||
|     seed = config["training"]["seed"] | ||||
|     if seed is not None: | ||||
|  |  | |||
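The `debug_model_cli` change above loads the config with `interpolate=True` so that references like `${system.gpu_allocator}` are resolved before `gpu_allocator` is read. A hedged sketch of the same call outside the CLI; the path and override key are illustrative only:

```python
from spacy import util

# "config.cfg" is a hypothetical path; overrides use the same dot-notation
# keys you would pass on the command line (e.g. --training.gpu_allocator).
config = util.load_config(
    "config.cfg",
    overrides={"training.gpu_allocator": "pytorch"},
    interpolate=True,  # resolve ${section.key} references to concrete values
)
print(config["training"]["gpu_allocator"])  # "pytorch"
```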
|  | @ -110,7 +110,7 @@ def package( | |||
|     msg.good(f"Successfully created package '{model_name_v}'", main_path) | ||||
|     if create_sdist: | ||||
|         with util.working_dir(main_path): | ||||
|             util.run_command([sys.executable, "setup.py", "sdist"]) | ||||
|             util.run_command([sys.executable, "setup.py", "sdist"], capture=False) | ||||
|         zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" | ||||
|         msg.good(f"Successfully created zipped Python package", zip_file) | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,10 +4,9 @@ import time | |||
| import re | ||||
| from collections import Counter | ||||
| from pathlib import Path | ||||
| from thinc.api import Config | ||||
| from thinc.api import use_pytorch_for_gpu_memory, require_gpu | ||||
| from thinc.api import require_gpu, set_gpu_allocator | ||||
| from thinc.api import set_dropout_rate, to_categorical, fix_random_seed | ||||
| from thinc.api import CosineDistance, L2Distance | ||||
| from thinc.api import Config, CosineDistance, L2Distance | ||||
| from wasabi import msg | ||||
| import srsly | ||||
| from functools import partial | ||||
|  | @ -32,7 +31,7 @@ def pretrain_cli( | |||
|     ctx: typer.Context,  # This is only used to read additional arguments | ||||
|     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), | ||||
|     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), | ||||
|     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), | ||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||
|  | @ -99,10 +98,12 @@ def pretrain( | |||
|     epoch_resume: Optional[int] = None, | ||||
|     use_gpu: int = -1, | ||||
| ): | ||||
|     if config["system"].get("seed") is not None: | ||||
|         fix_random_seed(config["system"]["seed"]) | ||||
|     if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"): | ||||
|         use_pytorch_for_gpu_memory() | ||||
|     if config["training"]["seed"] is not None: | ||||
|         fix_random_seed(config["training"]["seed"]) | ||||
|     allocator = config["training"]["gpu_allocator"] | ||||
|     if use_gpu >= 0 and allocator: | ||||
|         set_gpu_allocator(allocator) | ||||
| 
 | ||||
|     nlp, config = util.load_model_from_config(config) | ||||
|     P_cfg = config["pretraining"] | ||||
|     corpus = dot_to_object(config, P_cfg["corpus"]) | ||||
|  |  | |||
|  | @ -59,8 +59,9 @@ def project_run( | |||
|         for dep in cmd.get("deps", []): | ||||
|             if not (project_dir / dep).exists(): | ||||
|                 err = f"Missing dependency specified by command '{subcommand}': {dep}" | ||||
|                 err_help = "Maybe you forgot to run the 'project assets' command?" | ||||
|                 err_kwargs = {"exits": 1} if not dry else {} | ||||
|                 msg.fail(err, **err_kwargs) | ||||
|                 msg.fail(err, err_help, **err_kwargs) | ||||
|         with working_dir(project_dir) as current_dir: | ||||
|             rerun = check_rerun(current_dir, cmd) | ||||
|             if not rerun and not force: | ||||
|  | @ -144,7 +145,7 @@ def run_commands( | |||
|         if not silent: | ||||
|             print(f"Running command: {join_command(command)}") | ||||
|         if not dry: | ||||
|             run_command(command) | ||||
|             run_command(command, capture=False) | ||||
| 
 | ||||
| 
 | ||||
| def validate_subcommand( | ||||
|  |  | |||
|  | @ -8,7 +8,11 @@ train = "" | |||
| dev = "" | ||||
| 
 | ||||
| [system] | ||||
| use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }} | ||||
| {% if use_transformer -%} | ||||
| gpu_allocator = "pytorch" | ||||
| {% else -%} | ||||
| gpu_allocator = null | ||||
| {% endif %} | ||||
| 
 | ||||
| [nlp] | ||||
| lang = "{{ lang }}" | ||||
|  |  | |||
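The quickstart template now emits `gpu_allocator` via a Jinja conditional instead of the old `use_pytorch_for_gpu_memory` flag. A small sketch of how that fragment renders, assuming the `jinja2` package is installed (the quickstart templates use Jinja-style syntax):

```python
from jinja2 import Template

# The gpu_allocator fragment from the template above, as a Jinja2 string.
fragment = (
    '{% if use_transformer -%}\n'
    'gpu_allocator = "pytorch"\n'
    '{% else -%}\n'
    'gpu_allocator = null\n'
    '{% endif %}'
)

print(Template(fragment).render(use_transformer=True).strip())   # gpu_allocator = "pytorch"
print(Template(fragment).render(use_transformer=False).strip())  # gpu_allocator = null
```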
|  | @ -6,8 +6,7 @@ from pathlib import Path | |||
| from wasabi import msg | ||||
| import thinc | ||||
| import thinc.schedules | ||||
| from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed | ||||
| from thinc.api import Config, Optimizer | ||||
| from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator | ||||
| import random | ||||
| import typer | ||||
| import logging | ||||
|  | @ -29,7 +28,7 @@ def train_cli( | |||
|     ctx: typer.Context,  # This is only used to read additional arguments | ||||
|     config_path: Path = Arg(..., help="Path to config file", exists=True), | ||||
|     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), | ||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||
|     resume: bool = Opt(False, "--resume", "-R", help="Resume training"), | ||||
|  | @ -79,15 +78,16 @@ def train( | |||
|         config = util.load_config( | ||||
|             config_path, overrides=config_overrides, interpolate=True | ||||
|         ) | ||||
|     if config.get("training", {}).get("seed") is not None: | ||||
|     if config["training"]["seed"] is not None: | ||||
|         fix_random_seed(config["training"]["seed"]) | ||||
|     if config.get("system", {}).get("use_pytorch_for_gpu_memory"): | ||||
|         # It feels kind of weird to not have a default for this. | ||||
|         use_pytorch_for_gpu_memory() | ||||
|     allocator = config["training"]["gpu_allocator"] | ||||
|     if use_gpu >= 0 and allocator: | ||||
|         set_gpu_allocator(allocator) | ||||
|     # Use original config here before it's resolved to functions | ||||
|     sourced_components = get_sourced_components(config) | ||||
|     with show_validation_error(config_path): | ||||
|         nlp, config = util.load_model_from_config(config) | ||||
|     util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"]) | ||||
|     if config["training"]["vectors"] is not None: | ||||
|         util.load_vectors_into_model(nlp, config["training"]["vectors"]) | ||||
|     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) | ||||
|  |  | |||
|  | @ -6,13 +6,12 @@ init_tok2vec = null | |||
| 
 | ||||
| [system] | ||||
| seed = 0 | ||||
| use_pytorch_for_gpu_memory = false | ||||
| gpu_allocator = null | ||||
| 
 | ||||
| [nlp] | ||||
| lang = null | ||||
| pipeline = [] | ||||
| disabled = [] | ||||
| load_vocab_data = true | ||||
| before_creation = null | ||||
| after_creation = null | ||||
| after_pipeline_creation = null | ||||
|  | @ -52,12 +51,14 @@ limit = 0 | |||
| # Training hyper-parameters and additional features. | ||||
| [training] | ||||
| seed = ${system.seed} | ||||
| gpu_allocator = ${system.gpu_allocator} | ||||
| dropout = 0.1 | ||||
| accumulate_gradient = 1 | ||||
| # Extra resources for transfer-learning or pseudo-rehearsal | ||||
| init_tok2vec = ${paths.init_tok2vec} | ||||
| raw_text = ${paths.raw} | ||||
| vectors = null | ||||
| lookups = null | ||||
| # Controls early-stopping. 0 or -1 mean unlimited. | ||||
| patience = 1600 | ||||
| max_epochs = 0 | ||||
|  | @ -75,7 +76,6 @@ train_corpus = "corpora.train" | |||
| [training.logger] | ||||
| @loggers = "spacy.ConsoleLogger.v1" | ||||
| 
 | ||||
| 
 | ||||
| [training.batcher] | ||||
| @batchers = "spacy.batch_by_words.v1" | ||||
| discard_oversize = false | ||||
|  |  | |||
|  | @ -31,6 +31,7 @@ from .schemas import ConfigSchema | |||
| from .git_info import GIT_VERSION | ||||
| from . import util | ||||
| from . import about | ||||
| from .lookups import load_lookups | ||||
| 
 | ||||
| 
 | ||||
| # This is the base config will all settings (training etc.) | ||||
|  | @ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: | |||
|     return tokenizer_factory | ||||
| 
 | ||||
| 
 | ||||
| @registry.misc("spacy.LookupsDataLoader.v1") | ||||
| def load_lookups_data(lang, tables): | ||||
|     util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") | ||||
|     lookups = load_lookups(lang=lang, tables=tables) | ||||
|     return lookups | ||||
| 
 | ||||
| 
 | ||||
| class Language: | ||||
|     """A text-processing pipeline. Usually you'll load this once per process, | ||||
|     and pass the instance around your application. | ||||
|  | @ -152,7 +160,6 @@ class Language: | |||
|                 self.lang, | ||||
|                 self.Defaults, | ||||
|                 vectors_name=vectors_name, | ||||
|                 load_data=self._config["nlp"]["load_vocab_data"], | ||||
|             ) | ||||
|         else: | ||||
|             if (self.lang and vocab.lang) and (self.lang != vocab.lang): | ||||
|  |  | |||
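`spacy.LookupsDataLoader.v1` is registered in the `misc` registry, so a config can reference it (for example from a `[training.lookups]` block) to build a `Lookups` object from `spacy-lookups-data`. A hedged sketch of calling the registered function directly; it assumes `spacy-lookups-data` is installed and that importing spaCy has registered the function:

```python
from spacy.util import registry

# Look up the registered factory and call it like a normal function.
load_lookups_data = registry.get("misc", "spacy.LookupsDataLoader.v1")
lookups = load_lookups_data(lang="en", tables=["lexeme_norm"])
print(list(lookups.tables))  # ["lexeme_norm"]
```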
|  | @ -8,6 +8,7 @@ from collections import defaultdict | |||
| from thinc.api import Optimizer | ||||
| 
 | ||||
| from .attrs import NAMES | ||||
| from .lookups import Lookups | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     # This lets us add type hints for mypy etc. without causing circular imports | ||||
|  | @ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel): | |||
| class ConfigSchemaTraining(BaseModel): | ||||
|     # fmt: off | ||||
|     vectors: Optional[StrictStr] = Field(..., title="Path to vectors") | ||||
|     lookups: Optional[Lookups] = Field(..., title="Vocab lookups") | ||||
|     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") | ||||
|     train_corpus: StrictStr = Field(..., title="Path in the config to the training data") | ||||
|     batcher: Batcher = Field(..., title="Batcher for the training data") | ||||
|  | @ -207,6 +209,7 @@ class ConfigSchemaTraining(BaseModel): | |||
|     max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for") | ||||
|     eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)") | ||||
|     seed: Optional[StrictInt] = Field(..., title="Random seed") | ||||
|     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") | ||||
|     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") | ||||
|     score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") | ||||
|     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") | ||||
|  | @ -227,7 +230,6 @@ class ConfigSchemaNlp(BaseModel): | |||
|     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") | ||||
|     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default") | ||||
|     tokenizer: Callable = Field(..., title="The tokenizer to use") | ||||
|     load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") | ||||
|     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") | ||||
|     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") | ||||
|     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") | ||||
|  |  | |||
|  | @ -69,7 +69,6 @@ def test_util_dot_section(): | |||
|     [nlp] | ||||
|     lang = "en" | ||||
|     pipeline = ["textcat"] | ||||
|     load_vocab_data = false | ||||
| 
 | ||||
|     [components] | ||||
| 
 | ||||
|  | @ -95,15 +94,13 @@ def test_util_dot_section(): | |||
|     # not exclusive_classes | ||||
|     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False | ||||
|     # Test that default values got overwritten | ||||
|     assert not en_config["nlp"]["load_vocab_data"] | ||||
|     assert nl_config["nlp"]["load_vocab_data"]  # default value True | ||||
|     assert en_config["nlp"]["pipeline"] == ["textcat"] | ||||
|     assert nl_config["nlp"]["pipeline"] == [] # default value [] | ||||
|     # Test proper functioning of 'dot_to_object' | ||||
|     with pytest.raises(KeyError): | ||||
|         dot_to_object(en_config, "nlp.pipeline.tagger") | ||||
|     with pytest.raises(KeyError): | ||||
|         dot_to_object(en_config, "nlp.unknownattribute") | ||||
|     assert not dot_to_object(en_config, "nlp.load_vocab_data") | ||||
|     assert dot_to_object(nl_config, "nlp.load_vocab_data") | ||||
|     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
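The updated test exercises `dot_to_object`, which resolves a dot-notation path against a nested config dict and raises `KeyError` for unknown paths. A minimal sketch with a made-up dict:

```python
from spacy.util import dot_to_object

config = {"training": {"optimizer": {"learn_rate": 0.001}}}
print(dot_to_object(config, "training.optimizer.learn_rate"))  # 0.001

try:
    dot_to_object(config, "training.unknown")
except KeyError as err:
    print(f"no such path: {err}")
```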
|  | @ -253,6 +253,14 @@ def load_vectors_into_model( | |||
|                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) | ||||
| 
 | ||||
| 
 | ||||
| def load_vocab_data_into_model( | ||||
|     nlp: "Language", *, lookups: Optional["Lookups"] = None | ||||
| ) -> None: | ||||
|     """Load vocab data.""" | ||||
|     if lookups: | ||||
|         nlp.vocab.lookups = lookups | ||||
| 
 | ||||
| 
 | ||||
| def load_model( | ||||
|     name: Union[str, Path], | ||||
|     *, | ||||
|  | @ -651,8 +659,8 @@ def join_command(command: List[str]) -> str: | |||
| def run_command( | ||||
|     command: Union[str, List[str]], | ||||
|     *, | ||||
|     capture: bool = False, | ||||
|     stdin: Optional[Any] = None, | ||||
|     capture: bool = False, | ||||
| ) -> Optional[subprocess.CompletedProcess]: | ||||
|     """Run a command on the command line as a subprocess. If the subprocess | ||||
|     returns a non-zero exit code, a system exit is performed. | ||||
|  | @ -660,33 +668,46 @@ def run_command( | |||
|     command (str / List[str]): The command. If provided as a string, the | ||||
|         string will be split using shlex.split. | ||||
|     stdin (Optional[Any]): stdin to read from or None. | ||||
|     capture (bool): Whether to capture the output. | ||||
|     capture (bool): Whether to capture the output and errors. If False, | ||||
|         the stdout and stderr will not be redirected, and if there's an error, | ||||
|         sys.exit will be called with the returncode. You should use capture=False | ||||
|         when you want to turn over execution to the command, and capture=True | ||||
|         when you want to run the command more like a function. | ||||
|     RETURNS (Optional[CompletedProcess]): The process object. | ||||
|     """ | ||||
|     if isinstance(command, str): | ||||
|         command = split_command(command) | ||||
|         cmd_list = split_command(command) | ||||
|         cmd_str = command | ||||
|     else: | ||||
|         cmd_list = command | ||||
|         cmd_str = " ".join(command) | ||||
|     try: | ||||
|         ret = subprocess.run( | ||||
|             command, | ||||
|             cmd_list, | ||||
|             env=os.environ.copy(), | ||||
|             input=stdin, | ||||
|             encoding="utf8", | ||||
|             check=True, | ||||
|             check=False, | ||||
|             stdout=subprocess.PIPE if capture else None, | ||||
|             stderr=subprocess.PIPE if capture else None, | ||||
|             stderr=subprocess.STDOUT if capture else None, | ||||
|         ) | ||||
|     except FileNotFoundError: | ||||
|         # Indicates the *command* wasn't found, it's an error before the command | ||||
|         # is run. | ||||
|         raise FileNotFoundError( | ||||
|             Errors.E970.format(str_command=" ".join(command), tool=command[0]) | ||||
|             Errors.E970.format(str_command=cmd_str, tool=cmd_list[0]) | ||||
|         ) from None | ||||
|     except subprocess.CalledProcessError as e: | ||||
|         # We don't want a duplicate traceback here so we're making sure the | ||||
|         # CalledProcessError isn't re-raised. We also print both the string | ||||
|         # message and the stderr, in case the error only has one of them. | ||||
|         print(e.stderr) | ||||
|         print(e) | ||||
|         sys.exit(1) | ||||
|     if ret.returncode != 0: | ||||
|     if ret.returncode != 0 and capture: | ||||
|         message = f"Error running command:\n\n{cmd_str}\n\n" | ||||
|         message += f"Subprocess exited with status {ret.returncode}" | ||||
|         if ret.stdout is not None: | ||||
|             message += f"\n\nProcess log (stdout and stderr):\n\n" | ||||
|             message += ret.stdout | ||||
|         error = subprocess.SubprocessError(message) | ||||
|         error.ret = ret | ||||
|         error.command = cmd_str | ||||
|         raise error | ||||
|     elif ret.returncode != 0: | ||||
|         sys.exit(ret.returncode) | ||||
|     return ret | ||||
| 
 | ||||
|  |  | |||
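Under the new semantics, `capture=True` runs the subprocess like a function call (stdout and stderr are merged, returned on the result object, and a non-zero exit raises `subprocess.SubprocessError`), while `capture=False` streams output to the terminal and exits with the command's return code on failure. A hedged usage sketch:

```python
import subprocess
from spacy.util import run_command

# Run like a function: output is captured on the returned CompletedProcess.
ret = run_command("git --version", capture=True)
print(ret.stdout.strip())  # e.g. "git version 2.30.1"

# Hand execution over to the command: output goes straight to the terminal.
run_command(["git", "--version"], capture=False)

# With capture=True, a failing command surfaces as an exception you can handle.
try:
    run_command("git --definitely-not-a-real-flag", capture=True)
except subprocess.SubprocessError as err:
    print(f"command failed: {err}")
```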
|  | @ -28,7 +28,7 @@ cdef class Vocab: | |||
|     cpdef readonly StringStore strings | ||||
|     cpdef public Morphology morphology | ||||
|     cpdef public object vectors | ||||
|     cpdef public object lookups | ||||
|     cpdef public object _lookups | ||||
|     cpdef public object writing_system | ||||
|     cpdef public object get_noun_chunks | ||||
|     cdef readonly int length | ||||
|  |  | |||
|  | @ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS | |||
| from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang | ||||
| 
 | ||||
| 
 | ||||
| def create_vocab(lang, defaults, vectors_name=None, load_data=True): | ||||
| def create_vocab(lang, defaults, vectors_name=None): | ||||
|     # If the spacy-lookups-data package is installed, we pre-populate the lookups | ||||
|     # with lexeme data, if available | ||||
|     if load_data: | ||||
|         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"] | ||||
|         lookups = load_lookups(lang, tables=tables, strict=False) | ||||
|     else: | ||||
|         lookups = Lookups() | ||||
|     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} | ||||
|     # This is messy, but it's the minimal working fix to Issue #639. | ||||
|     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words) | ||||
|  | @ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True): | |||
|     lex_attrs[NORM] = util.add_lookups( | ||||
|         lex_attrs.get(NORM, LEX_ATTRS[NORM]), | ||||
|         BASE_NORMS, | ||||
|         lookups.get_table("lexeme_norm", {}), | ||||
|     ) | ||||
|     return Vocab( | ||||
|         lex_attr_getters=lex_attrs, | ||||
|         lookups=lookups, | ||||
|         writing_system=defaults.writing_system, | ||||
|         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), | ||||
|         vectors_name=vectors_name, | ||||
|  | @ -424,6 +417,19 @@ cdef class Vocab: | |||
|             orth = self.strings.add(orth) | ||||
|         return orth in self.vectors | ||||
| 
 | ||||
|     property lookups: | ||||
|         def __get__(self): | ||||
|             return self._lookups | ||||
| 
 | ||||
|         def __set__(self, lookups): | ||||
|             self._lookups = lookups | ||||
|             if lookups.has_table("lexeme_norm"): | ||||
|                 self.lex_attr_getters[NORM] = util.add_lookups( | ||||
|                     self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), | ||||
|                     self.lookups.get_table("lexeme_norm"), | ||||
|                 ) | ||||
| 
 | ||||
| 
 | ||||
|     def to_disk(self, path, *, exclude=tuple()): | ||||
|         """Save the current state to a directory. | ||||
| 
 | ||||
|  |  | |||
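Because `lookups` is now a property on `Vocab`, assigning a `Lookups` object that contains a `lexeme_norm` table immediately rewires the `NORM` lexeme attribute getter. A small sketch; the norm entry is a made-up example, and real tables normally come from `spacy-lookups-data`:

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")

lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})  # illustrative entry

# The property setter also refreshes the NORM lex_attr_getter.
nlp.vocab.lookups = lookups
print(nlp.vocab["gonna"].norm_)  # expected: "going to"
```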
|  | @ -763,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides | |||
| | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~                                                                              | | ||||
| | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       | | ||||
| | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                               | | ||||
| | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                 | | ||||
| | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                 | | ||||
| | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | ||||
| | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                  | | ||||
|  | @ -798,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | |||
| 
 | ||||
| | Name                    | Description                                                                                                                                                                           | | ||||
| | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                              | | ||||
| | `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                           | | ||||
| | `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                              | | ||||
| | `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~  | | ||||
| | `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                             | | ||||
| | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                   | | ||||
| | `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                            | | ||||
| | `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                            | | ||||
| | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ | | ||||
| | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                  | | ||||
|  | @ -893,8 +895,6 @@ what you need. By default, spaCy's | |||
| can provide any other repo (public or private) that you have access to using the | ||||
| `--repo` option. | ||||
| 
 | ||||
| <!-- TODO: update example once we've decided on repo structure --> | ||||
| 
 | ||||
| ```cli | ||||
| $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] | ||||
| ``` | ||||
|  | @ -902,7 +902,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] | |||
| > #### Example | ||||
| > | ||||
| > ```cli | ||||
| > $ python -m spacy project clone some_example | ||||
| > $ python -m spacy project clone pipelines/ner_wikiner | ||||
| > ``` | ||||
| > | ||||
| > Clone from custom repo: | ||||
|  |  | |||
|  | @ -60,7 +60,6 @@ your config and check that it's valid, you can run the | |||
| > [nlp] | ||||
| > lang = "en" | ||||
| > pipeline = ["tagger", "parser", "ner"] | ||||
| > load_vocab_data = true | ||||
| > before_creation = null | ||||
| > after_creation = null | ||||
| > after_pipeline_creation = null | ||||
|  | @ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and | |||
| | `lang`                    | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~                                                                                                                                                                                        | | ||||
| | `pipeline`                | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~                                                                        | | ||||
| | `disabled`                | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | | ||||
| | `load_vocab_data`         | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~                                                                                                                                | | ||||
| | `before_creation`         | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~                                                                                                      | | ||||
| | `after_creation`          | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                                                    | | ||||
| | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                                   | | ||||
|  | @ -190,7 +188,9 @@ process that are used when you run [`spacy train`](/api/cli#train). | |||
| | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               | | ||||
| | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                    | | ||||
| | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                              | | ||||
| | `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                            | | ||||
| | `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              | | ||||
| | `lookups`             | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                     | | ||||
| | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              | | ||||
| | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                    | | ||||
| | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      | | ||||
|  | @ -475,7 +475,7 @@ lexical data. | |||
| Here's an example of the 20 most frequent lexemes in the English training data: | ||||
| 
 | ||||
| ```json | ||||
| %%GITHUB_SPACY / extra / example_data / vocab - data.jsonl | ||||
| %%GITHUB_SPACY/extra/example_data/vocab-data.jsonl | ||||
| ``` | ||||
| 
 | ||||
| ## Pipeline meta {#meta} | ||||
|  |  | |||
|  | @ -145,9 +145,10 @@ pipelines. | |||
| > nlp = spacy.load("en_core_web_sm") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                             | | ||||
| | ----------- | --------------------------------------- | | ||||
| | **RETURNS** | Whether the GPU was activated. ~~bool~~ | | ||||
| | Name        | Description                                      | | ||||
| | ----------- | ------------------------------------------------ | | ||||
| | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ | | ||||
| | **RETURNS** | Whether the GPU was activated. ~~bool~~          | | ||||
| 
 | ||||
| ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"} | ||||
| 
 | ||||
|  | @ -164,9 +165,10 @@ and _before_ loading any pipelines. | |||
| > nlp = spacy.load("en_core_web_sm") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description     | | ||||
| | ----------- | --------------- | | ||||
| | **RETURNS** | `True` ~~bool~~ | | ||||
| | Name        | Description                                      | | ||||
| | ----------- | ------------------------------------------------ | | ||||
| | `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ | | ||||
| | **RETURNS** | `True` ~~bool~~                                  | | ||||
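Both `spacy.prefer_gpu` and `spacy.require_gpu` now accept a device index. A short sketch of selecting a specific device before loading any pipelines; it assumes a CUDA-capable GPU with index 0 and that the `en_core_web_sm` package is installed:

```python
import spacy

# Try to allocate on GPU 0; returns False and stays on CPU if no GPU is available.
activated = spacy.prefer_gpu(0)
print("GPU active:", activated)

nlp = spacy.load("en_core_web_sm")  # load pipelines only after selecting the device
```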
| 
 | ||||
| ## displaCy {#displacy source="spacy/displacy"} | ||||
| 
 | ||||
|  | @ -456,6 +458,16 @@ remain in the config file stored on your local system. | |||
| | `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | | ||||
| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                              | | ||||
| 
 | ||||
| <Project id="integrations/wandb"> | ||||
| 
 | ||||
| Get started with tracking your spaCy training runs in Weights & Biases using our | ||||
| project template. It trains on the IMDB Movie Review Dataset and includes a | ||||
| simple config with the built-in `WandbLogger`, as well as a custom example of | ||||
| creating variants of the config for a simple hyperparameter grid search and | ||||
| logging the results. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ## Readers {#readers source="spacy/training/corpus.py" new="3"} | ||||
| 
 | ||||
| Corpus readers are registered functions that load data and return a function | ||||
|  |  | |||
|  | @ -289,8 +289,7 @@ of objects by referring to creation functions, including functions you register | |||
| yourself. For details on how to get started with training your own model, check | ||||
| out the [training quickstart](/usage/training#quickstart). | ||||
| 
 | ||||
| <!-- TODO: | ||||
| <Project id="en_core_trf_lg"> | ||||
| <!-- TODO: <Project id="en_core_trf_lg"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a transformers-based project | ||||
| template. Swap in your data, edit the settings and hyperparameters and train, | ||||
|  | @ -623,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`, | |||
| `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and | ||||
| expect the same types of objects, although for pretraining your corpus does not | ||||
| need to have any annotations, so you will often use a different reader, such as | ||||
| the [`JsonlReader`](/api/toplevel#jsonlreader). | ||||
| the [`JsonlReader`](/api/top-level#jsonlreader). | ||||
| 
 | ||||
| > #### Raw text format | ||||
| > | ||||
|  | @ -655,6 +654,16 @@ and pass in optional config overrides, like the path to the raw text file: | |||
| $ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl | ||||
| ``` | ||||
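The `--paths.raw` override above points at a JSONL file of raw text, one object per line with a `"text"` key. A hedged sketch of writing such a file with `srsly` (the texts and the output path are placeholders):

```python
import srsly

texts = [
    {"text": "Can I ask where you work now and what you do?"},
    {"text": "They may have other reasons for being there."},
]
srsly.write_jsonl("./text.jsonl", texts)  # one JSON object per line
```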
| 
 | ||||
| The following defaults are used for the `[pretraining]` block and merged into | ||||
| your existing config when you run [`init config`](/api/cli#init-config) or | ||||
| [`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed, | ||||
| you can [configure](#pretraining-configure) the settings and hyperparameters or | ||||
| change the [objective](#pretraining-details). | ||||
| 
 | ||||
| ```ini | ||||
| %%GITHUB_SPACY/spacy/default_config_pretraining.cfg | ||||
| ``` | ||||
| 
 | ||||
| ### How pretraining works {#pretraining-details} | ||||
| 
 | ||||
| The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually | ||||
|  |  | |||
|  | @ -45,7 +45,7 @@ spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy | |||
| right up to **current state-of-the-art**. You can also use a CPU-optimized | ||||
| pipeline, which is less accurate but much cheaper to run. | ||||
| 
 | ||||
| <!-- TODO: --> | ||||
| <!-- TODO: update benchmarks and intro --> | ||||
| 
 | ||||
| > #### Evaluation details | ||||
| > | ||||
|  | @ -68,6 +68,6 @@ our project template. | |||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| <!-- ## Citing spaCy {#citation} | ||||
| <!-- TODO: ## Citing spaCy {#citation} | ||||
| 
 | ||||
| <!-- TODO: update --> | ||||
| --> | ||||
|  |  | |||
|  | @ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible. | |||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| Note that when using a PyTorch or TensorFlow model, it is recommended to set the | ||||
| GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or | ||||
| "tensorflow" in the training config, cupy will allocate memory via those | ||||
| respective libraries, preventing out-of-memory errors when there's free memory | ||||
| sitting unused in the other library's pool. | ||||
| 
 | ||||
| ```ini | ||||
| ### config.cfg (excerpt) | ||||
| [training] | ||||
| gpu_allocator = "pytorch" | ||||
| ``` | ||||
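If you are writing your own training loop rather than using `spacy train`, the same effect can be achieved programmatically with Thinc's helpers; a minimal sketch, assuming a CUDA-capable GPU and PyTorch installed:

```python
from thinc.api import require_gpu, set_gpu_allocator

set_gpu_allocator("pytorch")  # route cupy allocations through PyTorch's memory pool
require_gpu(0)                # activate GPU 0 before creating any models
```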
| 
 | ||||
| ## Custom models with Thinc {#thinc} | ||||
| 
 | ||||
| Of course it's also possible to define the `Model` from the previous section | ||||
|  | @ -477,7 +489,7 @@ with Model.define_operators({">>": chain}): | |||
| <Infobox title="This section is still under construction" emoji="🚧" variant="warning"> | ||||
| </Infobox> | ||||
| 
 | ||||
| <!-- TODO: | ||||
| <!-- TODO: write trainable component section | ||||
| - Interaction with `predict`, `get_loss` and `set_annotations` | ||||
| - Initialization life-cycle with `begin_training`, correlation with add_label | ||||
| Example: relation extraction component (implemented as project template) | ||||
|  |  | |||
|  | @ -381,8 +381,6 @@ and loading pipeline packages, the underlying functionality is entirely based on | |||
| native Python packaging. This allows your application to handle a spaCy pipeline | ||||
| like any other package dependency. | ||||
| 
 | ||||
| <!-- TODO: reference relevant spaCy project --> | ||||
| 
 | ||||
| ### Downloading and requiring package dependencies {#models-download} | ||||
| 
 | ||||
| spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a | ||||
|  |  | |||
|  | @ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new | |||
| 
 | ||||
|  | ||||
| 
 | ||||
| <!-- TODO: | ||||
| <Project id="some_example_project"> | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum | ||||
| sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat | ||||
| mattis pretium. | ||||
| The easiest way to get started is to clone a project template and run it – for | ||||
| example, this end-to-end template that lets you train a **part-of-speech | ||||
| tagger** and **dependency parser** on a Universal Dependencies treebank. | ||||
| 
 | ||||
| </Project> | ||||
| --> | ||||
| 
 | ||||
| spaCy projects make it easy to integrate with many other **awesome tools** in | ||||
| the data science and machine learning ecosystem to track and manage your data | ||||
|  | @ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the | |||
| project, e.g. to train a pipeline and edit the commands and scripts to build | ||||
| fully custom workflows. | ||||
| 
 | ||||
| <!-- TODO: update with real example project --> | ||||
| 
 | ||||
| ```cli | ||||
| python -m spacy project clone some_example_project | ||||
| python -m spacy project clone pipelines/tagger_parser_ud | ||||
| ``` | ||||
| 
 | ||||
| By default, the project will be cloned into the current working directory. You | ||||
|  | @ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up | |||
| a quick web demo. It looks pretty similar to a config file used to define CI | ||||
| pipelines. | ||||
| 
 | ||||
| <!-- TODO: update with better (final) example --> | ||||
| 
 | ||||
| ```yaml | ||||
| https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml | ||||
| https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml | ||||
| ``` | ||||
| 
 | ||||
| | Section       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | | ||||
|  | @ -976,14 +970,12 @@ your results. | |||
| 
 | ||||
|  | ||||
| 
 | ||||
| <!-- TODO: | ||||
| 
 | ||||
| <Project id="integrations/wandb"> | ||||
| 
 | ||||
| Get started with tracking your spaCy training runs in Weights & Biases using our | ||||
| project template. It includes a simple config using the `WandbLogger`, as well | ||||
| as a custom logger implementation you can adjust for your specific use case. | ||||
| project template. It trains on the IMDB Movie Review Dataset and includes a | ||||
| simple config with the built-in `WandbLogger`, as well as a custom example of | ||||
| creating variants of the config for a simple hyperparameter grid search and | ||||
| logging the results. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| --> | ||||
|  |  | |||
|  | @ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data, | |||
| meta and configuration will be written out. To make the pipeline more convenient | ||||
| to deploy, we recommend wrapping it as a [Python package](/api/cli#package). | ||||
| 
 | ||||
| <Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config"> | ||||
| <Accordion title="What’s the difference between the config.cfg and meta.json?" id="models-meta-vs-config" spaced> | ||||
| 
 | ||||
| When you save a pipeline in spaCy v3.0+, two files will be exported: a | ||||
| [`config.cfg`](/api/data-formats#config) based on | ||||
|  | @ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta). | |||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started with an end-to-end workflow is to clone a | ||||
| [project template](/usage/projects) and run it – for example, this template that | ||||
| lets you train a **part-of-speech tagger** and **dependency parser** on a | ||||
| Universal Dependencies treebank and generates an installable Python package. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ### Generating a pipeline package {#models-generating} | ||||
| 
 | ||||
| <Infobox title="Important note" variant="warning"> | ||||
|  | @ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead. | |||
| ```python | ||||
| nlp = spacy.blank("en").from_disk("/path/to/data") | ||||
| ``` | ||||
| 
 | ||||
| <!-- TODO: point to spaCy projects? --> | ||||
|  |  | |||
|  | @ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the | |||
| $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy | ||||
| ``` | ||||
| 
 | ||||
| <Accordion title="How are the config recommendations generated?" id="quickstart-source"> | ||||
| <Accordion title="How are the config recommendations generated?" id="quickstart-source" spaced> | ||||
| 
 | ||||
| The recommended config settings generated by the quickstart widget and the | ||||
| [`init config`](/api/cli#init-config) command are based on some general **best | ||||
|  | @ -112,6 +112,15 @@ as we run more experiments. | |||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a [project template](/usage/projects) | ||||
| and run it – for example, this end-to-end template that lets you train a | ||||
| **part-of-speech tagger** and **dependency parser** on a Universal Dependencies | ||||
| treebank. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ## Training config {#config} | ||||
| 
 | ||||
| Training config files include all **settings and hyperparameters** for training | ||||
|  |  | |||
|  | @ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model. | |||
| 
 | ||||
| ### Manage end-to-end workflows with projects {#features-projects} | ||||
| 
 | ||||
| <!-- TODO: update example --> | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```cli | ||||
| > # Clone a project template | ||||
| > $ python -m spacy project clone example | ||||
| > $ cd example | ||||
| > $ python -m spacy project clone pipelines/tagger_parser_ud | ||||
| > $ cd tagger_parser_ud | ||||
| > # Download data assets | ||||
| > $ python -m spacy project assets | ||||
| > # Run a workflow | ||||
| > $ python -m spacy project run train | ||||
| > $ python -m spacy project run all | ||||
| > ``` | ||||
| 
 | ||||
| spaCy projects let you manage and share **end-to-end spaCy workflows** for | ||||
|  | @ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps, | |||
| [Ray](/usage/projects#ray) for parallel training, | ||||
| [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more! | ||||
| 
 | ||||
| <!-- <Project id="some_example_project"> | ||||
| 
 | ||||
| The easiest way to get started with an end-to-end training process is to clone a | ||||
| [project](/usage/projects) template. Projects let you manage multi-step | ||||
| workflows, from data preprocessing to training and packaging your pipeline. | ||||
| 
 | ||||
| </Project>--> | ||||
| 
 | ||||
| <Infobox title="Details & Documentation" emoji="📖" list> | ||||
| 
 | ||||
| - **Usage:** [spaCy projects](/usage/projects), | ||||
|  | @ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline. | |||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a [project template](/usage/projects) | ||||
| and run it – for example, this end-to-end template that lets you train a | ||||
| **part-of-speech tagger** and **dependency parser** on a Universal Dependencies | ||||
| treebank. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| ### Parallel and distributed training with Ray {#features-parallel-training} | ||||
| 
 | ||||
| > #### Example | ||||
|  | @ -875,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training: | |||
| + python -m spacy train ./config.cfg --output ./output | ||||
| ``` | ||||
| 
 | ||||
| <!-- TODO: project template --> | ||||
| <Project id="pipelines/tagger_parser_ud"> | ||||
| 
 | ||||
| The easiest way to get started is to clone a [project template](/usage/projects) | ||||
| and run it – for example, this end-to-end template that lets you train a | ||||
| **part-of-speech tagger** and **dependency parser** on a Universal Dependencies | ||||
| treebank. | ||||
| 
 | ||||
| </Project> | ||||
| 
 | ||||
| #### Training via the Python API {#migrating-training-python} | ||||
| 
 | ||||
|  |  | |||
|  | @ -12,6 +12,7 @@ | |||
|     "companyUrl": "https://explosion.ai", | ||||
|     "repo": "explosion/spaCy", | ||||
|     "modelsRepo": "explosion/spacy-models", | ||||
|     "projectsRepo": "explosion/projects/tree/v3", | ||||
|     "social": { | ||||
|         "twitter": "spacy_io", | ||||
|         "github": "explosion" | ||||
|  |  | |||
|  | @ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) { | |||
|         const isValid = isString(children) && !isNaN(children) | ||||
|         const version = isValid ? Number(children).toFixed(1) : children | ||||
|         const tooltipText = `This feature is new and was introduced in spaCy v${version}` | ||||
|         // TODO: we probably want to handle this more elegantly, but the idea is | ||||
|         // We probably want to handle this more elegantly, but the idea is | ||||
|         // that we can hide tags referring to old versions | ||||
|         const major = isString(version) ? Number(version.split('.')[0]) : version | ||||
|         return major < MIN_VERSION ? null : ( | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser() | |||
| const DEFAULT_BRANCH = 'develop' | ||||
| export const repo = siteMetadata.repo | ||||
| export const modelsRepo = siteMetadata.modelsRepo | ||||
| export const projectsRepo = siteMetadata.projectsRepo | ||||
| 
 | ||||
| /** | ||||
|  * This is used to provide selectors for headings so they can be crawled by | ||||
|  |  | |||
|  | @ -15,6 +15,10 @@ | |||
|     background: transparent | ||||
|     resize: none | ||||
|     font: inherit | ||||
|     overflow: hidden | ||||
|     white-space: nowrap | ||||
|     text-overflow: ellipsis | ||||
|     margin-right: 1rem | ||||
| 
 | ||||
| .prefix | ||||
|     margin-right: 0.75em | ||||
|  |  | |||
|  | @ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md' | |||
| 
 | ||||
| const CODE_EXAMPLE = `# pip install spacy | ||||
| # python -m spacy download en_core_web_sm | ||||
| 
 | ||||
| import spacy | ||||
| 
 | ||||
| # Load English tokenizer, tagger, parser and NER | ||||
|  | @ -120,7 +119,7 @@ const Landing = ({ data }) => { | |||
|                         </Li> | ||||
|                         <Li> | ||||
|                             ✅ Components for <strong>named entity</strong> recognition, | ||||
|                             part-of-speech-tagging, dependency parsing, sentence segmentation,{' '} | ||||
|                             part-of-speech tagging, dependency parsing, sentence segmentation,{' '} | ||||
|                             <strong>text classification</strong>, lemmatization, morphological | ||||
|                             analysis, entity linking and more | ||||
|                         </Li> | ||||
|  | @ -223,10 +222,11 @@ const Landing = ({ data }) => { | |||
|                     <br /> | ||||
|                     <br /> | ||||
|                     <br /> | ||||
|                     {/** TODO: update with actual example */} | ||||
|                     <Project id="some_example"> | ||||
|                         Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum | ||||
|                         sodales lectus. | ||||
|                     <Project id="pipelines/tagger_parser_ud" title="Get started"> | ||||
|                         The easiest way to get started is to clone a project template and run it | ||||
|                         – for example, this template for training a{' '} | ||||
|                         <strong>part-of-speech tagger</strong> and{' '} | ||||
|                         <strong>dependency parser</strong> on a Universal Dependencies treebank. | ||||
|                     </Project> | ||||
|                 </LandingCol> | ||||
|                 <LandingCol> | ||||
|  |  | |||
|  | @ -4,25 +4,29 @@ import CopyInput from '../components/copy' | |||
| import Infobox from '../components/infobox' | ||||
| import Link from '../components/link' | ||||
| import { InlineCode } from '../components/code' | ||||
| import { projectsRepo } from '../components/util' | ||||
| 
 | ||||
| // TODO: move to meta? | ||||
| const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3' | ||||
| const COMMAND = 'python -m spacy project clone' | ||||
| 
 | ||||
| export default function Project({ id, repo, children }) { | ||||
| export default function Project({ | ||||
|     title = 'Get started with a project template', | ||||
|     id, | ||||
|     repo, | ||||
|     children, | ||||
| }) { | ||||
|     const repoArg = repo ? ` --repo ${repo}` : '' | ||||
|     const text = `${COMMAND} ${id}${repoArg}` | ||||
|     const url = `${repo || DEFAULT_REPO}/${id}` | ||||
|     const title = ( | ||||
|     const url = `${repo || projectsRepo}/${id}` | ||||
|     const header = ( | ||||
|         <> | ||||
|             Get started with a project template:{' '} | ||||
|             {title}:{' '} | ||||
|             <Link to={url}> | ||||
|                 <InlineCode>{id}</InlineCode> | ||||
|             </Link> | ||||
|         </> | ||||
|     ) | ||||
|     return ( | ||||
|         <Infobox title={title} emoji="🪐"> | ||||
|         <Infobox title={header} emoji="🪐"> | ||||
|             {children} | ||||
|             <CopyInput text={text} prefix="$" /> | ||||
|         </Infobox> | ||||
|  |  | |||