Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-10 16:22:29 +03:00)

Commit e8156d191f: Merge branch 'develop' into nightly.spacy.io

Makefile (2 lines changed):
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
 endif
 
 ifndef PYVER
@@ -7,7 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a43,<8.0.0a50",
-    "blis>=0.4.0,<0.5.0",
+    "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
 ]
@@ -2,7 +2,7 @@
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
@@ -41,7 +41,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.0.0a43,<8.0.0a50
-    blis>=0.4.0,<0.5.0
+    blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
@@ -92,6 +92,8 @@ ko =
     natto-py==0.9.0
 th =
     pythainlp>=2.0
+zh =
+    spacy-pkuseg==0.0.26
 
 [bdist_wheel]
 universal = false
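The new zh extras group lets the Chinese segmentation dependency be pulled in through pip extras instead of being installed by hand. A minimal sketch, assuming the nightly package name from about.py below:

    pip install "spacy-nightly[zh]"
    # or install the pinned dependency directly:
    pip install spacy-pkuseg==0.0.26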
@@ -1,7 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a32"
-__release__ = True
+__version__ = "3.0.0a34"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
 import sys
 import shutil
 from pathlib import Path
@@ -16,7 +16,8 @@ import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ENV_VARS
+from ..util import is_compatible_version, ENV_VARS
+from .. import about
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
         msg.fail(invalid_err)
         print("\n".join(errors))
         sys.exit(1)
+    validate_project_version(config)
     validate_project_commands(config)
     # Make sure directories defined in config exist
     for subdir in config.get("directories", []):
@@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
     return dict(interpolated["project"])
 
 
+def validate_project_version(config: Dict[str, Any]) -> None:
+    """If the project defines a compatible spaCy version range, check that it's
+    compatible with the current version of spaCy.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    spacy_version = config.get("spacy_version", None)
+    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+        err = (
+            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+            f"that's not compatible with the version of spaCy you're running "
+            f"({about.__version__}). You can edit version requirement in the "
+            f"{PROJECT_FILE} to load it, but the project may not run as expected."
+        )
+        msg.fail(err, exits=1)
+
+
 def validate_project_commands(config: Dict[str, Any]) -> None:
     """Check that project commands and workflows are valid, don't contain
     duplicates, don't clash and only refer to commands that exist.
@@ -193,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
         )
 
 
-def get_hash(data) -> str:
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
     """Get the hash for a JSON-serializable object.
 
     data: The data to hash.
+    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
     RETURNS (str): The hash.
     """
+    if isinstance(data, dict):
+        data = {k: v for k, v in data.items() if k not in exclude}
     data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
     return hashlib.md5(data_str).hexdigest()
 
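The new exclude argument strips the listed top-level keys before hashing, so two dicts that differ only in those keys hash identically. A minimal sketch of the behaviour, with made-up dict values:

    from spacy.cli._util import get_hash

    a = {"script": ["python train.py"], "spacy_version": "3.0.0a32"}
    b = {"script": ["python train.py"], "spacy_version": "3.0.0a34"}
    assert get_hash(a) != get_hash(b)
    # ignoring the volatile key makes the two entries compare equal
    assert get_hash(a, exclude=["spacy_version"]) == get_hash(b, exclude=["spacy_version"])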
@@ -7,7 +7,9 @@ import tarfile
 from pathlib import Path
 
 from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir
+from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
+from ...git_info import GIT_VERSION
+from ... import about
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -129,7 +131,10 @@ def get_command_hash(
     currently installed packages, whatever environment variables have been marked
     as relevant, and the command.
     """
-    hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
+    check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
+    spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
+    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
+    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
     hashes.extend(cmd)
     creation_bytes = "".join(hashes).encode("utf8")
     return hashlib.md5(creation_bytes).hexdigest()
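get_command_hash() now folds the spaCy minor version into every remote-storage hash, so cached outputs are keyed per minor release by default. Commit-level granularity is opt-in through the ENV_VARS.PROJECT_USE_GIT_VERSION flag; a minimal sketch, assuming that flag resolves to the environment variable name SPACY_PROJECT_USE_GIT_VERSION:

    import os

    # assumption: ENV_VARS.PROJECT_USE_GIT_VERSION == "SPACY_PROJECT_USE_GIT_VERSION"
    os.environ["SPACY_PROJECT_USE_GIT_VERSION"] = "true"
    # with the flag set, the exact GIT_VERSION commit is hashed instead of the
    # minor version, so remote caches are effectively per-commit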
@@ -4,8 +4,11 @@ from wasabi import msg
 import sys
 import srsly
 
+from ... import about
+from ...git_info import GIT_VERSION
 from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList
+from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
+from ...util import check_bool_env_var
 from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
 from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
 
@@ -62,12 +65,13 @@ def project_run(
                 err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, err_help, **err_kwargs)
+        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
         with working_dir(project_dir) as current_dir:
-            rerun = check_rerun(current_dir, cmd)
+            msg.divider(subcommand)
+            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
             if not rerun and not force:
                 msg.info(f"Skipping '{cmd['name']}': nothing changed")
             else:
-                msg.divider(subcommand)
                 run_commands(cmd["script"], dry=dry)
                 if not dry:
                     update_lockfile(current_dir, cmd)
@@ -171,12 +175,19 @@ def validate_subcommand(
         )
 
 
-def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
+def check_rerun(
+    project_dir: Path,
+    command: Dict[str, Any],
+    *,
+    check_spacy_version: bool = True,
+    check_spacy_commit: bool = False,
+) -> bool:
     """Check if a command should be rerun because its settings or inputs/outputs
     changed.
 
     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
+    strict_version (bool):
     RETURNS (bool): Whether to re-run the command.
     """
     lock_path = project_dir / PROJECT_LOCK
@@ -189,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
     # Always run commands with no outputs (otherwise they'd always be skipped)
     if not entry.get("outs", []):
         return True
+    # Always rerun if spaCy version or commit hash changed
+    spacy_v = entry.get("spacy_version")
+    commit = entry.get("spacy_git_version")
+    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+        return True
+    if check_spacy_commit and commit != GIT_VERSION:
+        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+        return True
     # If the entry in the lockfile matches the lockfile entry that would be
     # generated from the current command, we don't rerun because it means that
     # all inputs/outputs, hashes and scripts are the same and nothing changed
-    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
+    lock_entry = get_lock_entry(project_dir, command)
+    exclude = ["spacy_version", "spacy_git_version"]
+    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
 
 
 def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@@ -231,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
         "script": command["script"],
         "deps": deps,
         "outs": [*outs, *outs_nc],
+        "spacy_version": about.__version__,
+        "spacy_git_version": GIT_VERSION,
     }
 
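Each lockfile entry written by update_lockfile() now records the spaCy version and git commit it was produced with; check_rerun() compares those fields first and only then falls back to the hash comparison, with the two new keys excluded so they don't perturb the hashes themselves. A sketch of the resulting entry shape, with illustrative values:

    # illustrative shape of a get_lock_entry() result; values are made up
    lock_entry = {
        "script": ["python -m spacy train configs/config.cfg -o training/"],
        "deps": [],
        "outs": ["training/model-best"],
        "spacy_version": "3.0.0a34",        # checked with is_minor_version_match()
        "spacy_git_version": "e8156d191f",  # checked only when the env flag is set
    }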
@ -171,9 +171,14 @@ factory = "tok2vec"
|
||||||
[components.tok2vec.model.embed]
|
[components.tok2vec.model.embed]
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
{% if has_letters -%}
|
||||||
also_embed_subwords = {{ "true" if has_letters else "false" }}
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
rows = [5000, 2500, 2500, 2500]
|
||||||
|
{% else -%}
|
||||||
|
attrs = ["ORTH", "SHAPE"]
|
||||||
|
rows = [5000, 2500]
|
||||||
|
{% endif -%}
|
||||||
|
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
[components.tok2vec.model.encode]
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
|
|
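For a language where has_letters is true and optimize is not "accuracy", the template above renders to roughly the following embed block (a sketch of the expected output, not copied from a generated config):

    [components.tok2vec.model.embed]
    @architectures = "spacy.MultiHashEmbed.v1"
    width = ${components.tok2vec.model.encode.width}
    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
    rows = [5000, 2500, 2500, 2500]
    include_static_vectors = false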
@@ -456,10 +456,14 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+    E901 = ("Failed to remove existing output directory: {path}. If your "
+            "config and the components you train change between runs, a "
+            "non-empty output directory can lead to stale pipeline data. To "
+            "solve this, remove the existing directories in the output directory.")
+    E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
-    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+    E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
             "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
     E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
             "dimension refers to the output width, after the linear projection "
@@ -25,8 +25,14 @@ class Russian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool = False,
+):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Russian"]
@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple
 
 from thinc.api import Model
 
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...symbols import POS
 from ...tokens import Token
@@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
 
         try:
             from pymorphy2 import MorphAnalyzer
@@ -26,8 +26,10 @@ class Ukrainian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Ukrainian"]
@@ -3,7 +3,6 @@ from typing import Optional
 from thinc.api import Model
 
 from ..ru.lemmatizer import RussianLemmatizer
-from ...lookups import Lookups
 from ...vocab import Vocab
 
 
@@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
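Both pymorphy2 lemmatizers now take the shared overwrite setting instead of the removed lookups argument. A minimal sketch of enabling it when adding the component, assuming the standard "lemmatizer" factory name used above (requires the pymorphy2 package):

    import spacy

    nlp = spacy.blank("ru")
    # overwrite=True lets the lemmatizer replace lemmas that are already set
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2", "overwrite": True})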
@@ -17,8 +17,7 @@ from ... import util
 
 
 # fmt: off
-_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
-_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
 # fmt: on
 
 DEFAULT_CONFIG = """
@@ -55,9 +54,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
 
 
 class ChineseTokenizer(DummyTokenizer):
-    def __init__(
-        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
-    ):
+    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
@@ -82,11 +79,13 @@ class ChineseTokenizer(DummyTokenizer):
         *,
         nlp: Optional[Language] = None,
         pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: str = "default",
+        pkuseg_user_dict: Optional[str] = "default",
     ):
         if self.segmenter == Segmenter.pkuseg:
+            if pkuseg_user_dict is None:
+                pkuseg_user_dict = pkuseg_model
             self.pkuseg_seg = try_pkuseg_import(
-                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict
             )
 
     def __call__(self, text: str) -> Doc:
@@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
         if self.segmenter == Segmenter.pkuseg:
             if reset:
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
 
-                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                    self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
                 except ImportError:
                     msg = (
-                        "pkuseg not installed: unable to reset pkuseg "
+                        "spacy_pkuseg not installed: unable to reset pkuseg "
                         "user dict. Please " + _PKUSEG_INSTALL_MSG
                     )
                     raise ImportError(msg) from None
@@ -156,23 +155,7 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
-                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
-                # means that it will be saved with pickle protocol 5 with
-                # python 3.8, which can't be reloaded with python 3.6-3.7.
-                # To try to make the model compatible with python 3.6+, reload
-                # the data with pickle5 and convert it back to protocol 4.
-                try:
-                    import pickle5
-
-                    with open(tempdir / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(tempdir / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
-                with open(tempdir / "features.pkl", "rb") as fileh:
+                with open(tempdir / "features.msgpack", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
                     pkuseg_weights_b = fileh.read()
@@ -213,22 +196,22 @@ class ChineseTokenizer(DummyTokenizer):
         if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = Path(tempdir)
-                with open(tempdir / "features.pkl", "wb") as fileh:
+                with open(tempdir / "features.msgpack", "wb") as fileh:
                     fileh.write(pkuseg_data["features_b"])
                 with open(tempdir / "weights.npz", "wb") as fileh:
                     fileh.write(pkuseg_data["weights_b"])
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
                 except ImportError:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
-                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
         if pkuseg_data["processors_data"]:
             processors_data = pkuseg_data["processors_data"]
             (user_dict, do_process, common_words, other_words) = processors_data
-            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+            self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
             self.pkuseg_seg.postprocesser.do_process = do_process
             self.pkuseg_seg.postprocesser.common_words = set(common_words)
             self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer):
                     path.mkdir(parents=True)
                 self.pkuseg_seg.model.save(path)
                 self.pkuseg_seg.feature_extractor.save(path)
-                # try to convert features.pkl to pickle protocol 4
-                try:
-                    import pickle5
-
-                    with open(path / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(path / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):
             if self.pkuseg_seg:
@@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):
 
         def load_pkuseg_model(path):
            try:
-                import pkuseg
+                import spacy_pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
                    ) from None
            if path.exists():
-                self.pkuseg_seg = pkuseg.pkuseg(path)
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
 
         def load_pkuseg_processors(path):
            try:
-                import pkuseg
+                import spacy_pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    raise ImportError(self._pkuseg_install_msg) from None
            if self.segmenter == Segmenter.pkuseg:
                data = srsly.read_msgpack(path)
                (user_dict, do_process, common_words, other_words) = data
-                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
                self.pkuseg_seg.postprocesser.do_process = do_process
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -341,12 +312,13 @@ def try_jieba_import() -> None:
 
 def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
-        import pkuseg
+        import spacy_pkuseg
 
-        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
         raise ImportError(msg) from None
+    try:
+        return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except FileNotFoundError:
         msg = "Unable to load pkuseg model from: " + pkuseg_model
         raise FileNotFoundError(msg) from None
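After the switch to spacy-pkuseg, the pkuseg segmenter is selected and initialized the same way as before, just against the new package. A minimal sketch mirroring the test fixture configuration later in this commit, assuming partial config overrides are merged into the Chinese defaults (requires spacy-pkuseg):

    from spacy.lang.zh import Chinese

    config = {
        "nlp": {"tokenizer": {"segmenter": "pkuseg"}},
        "initialize": {"tokenizer": {"pkuseg_model": "web"}},
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()  # loads the "web" spacy-pkuseg model
    doc = nlp("我喜欢自然语言处理")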
@@ -289,13 +289,12 @@ class Lookups:
 
         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
+        filepath = path / filename
+        with filepath.open("wb") as file_:
+            file_.write(self.to_bytes())
 
     def from_disk(
         self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
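Lookups.to_disk() no longer skips writing when no tables have been added, so an empty Lookups object now round-trips through disk like any other. A small sketch of the behaviour, with an illustrative path:

    from spacy.lookups import Lookups

    lookups = Lookups()                     # no tables added
    lookups.to_disk("/tmp/lookups_demo")    # a lookups.bin file is still written
    restored = Lookups().from_disk("/tmp/lookups_demo")
    assert len(restored) == 0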
@@ -11,7 +11,7 @@ from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
+from ...attrs import intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -29,7 +29,7 @@ def build_hash_embed_cnn_tok2vec(
     window_size: int,
     maxout_pieces: int,
     subword_features: bool,
-    pretrained_vectors: Optional[bool]
+    pretrained_vectors: Optional[bool],
 ) -> Model[List[Doc], List[Floats2d]]:
     """Build spaCy's 'standard' tok2vec layer, which uses hash embedding
     with subword features and a CNN with layer-normalized maxout.
@@ -54,12 +54,18 @@ def build_hash_embed_cnn_tok2vec(
         a language such as Chinese.
     pretrained_vectors (bool): Whether to also use static vectors.
     """
+    if subword_features:
+        attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+        row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2]
+    else:
+        attrs = ["NORM"]
+        row_sizes = [embed_size]
     return build_Tok2Vec_model(
         embed=MultiHashEmbed(
             width=width,
-            rows=embed_size,
-            also_embed_subwords=subword_features,
-            also_use_static_vectors=bool(pretrained_vectors),
+            rows=row_sizes,
+            attrs=attrs,
+            include_static_vectors=bool(pretrained_vectors),
         ),
         encode=MaxoutWindowEncoder(
             width=width,
@@ -93,58 +99,59 @@ def build_Tok2Vec_model(
 
 @registry.architectures.register("spacy.MultiHashEmbed.v1")
 def MultiHashEmbed(
-    width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
+    width: int,
+    attrs: List[Union[str, int]],
+    rows: List[int],
+    include_static_vectors: bool,
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
-    varying definitions depending on the Vocab of the Doc object passed in.
-    Vectors from pretrained static vectors can also be incorporated into the
-    concatenated representation.
+    The features used can be configured with the 'attrs' argument. The suggested
+    attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+    account some subword information, without constructing a fully character-based
+    representation. If pretrained vectors are available, they can be included in
+    the representation as well, with the vectors table kept static
+    (i.e. it's not updated).
+
+    The `width` parameter specifies the output width of the layer and the widths
+    of all embedding tables. If static vectors are included, a learned linear
+    layer is used to map the vectors to the specified width before concatenating
+    it with the other embedding outputs. A single Maxout layer is then used to
+    reduce the concatenated vectors to the final width.
+
+    The `rows` parameter controls the number of rows used by the `HashEmbed`
+    tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+    the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+    even for very large vocabularies. A number of rows must be specified for each
+    table, so the `rows` list must be of the same length as the `attrs` parameter.
 
     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
-    rows (int): The number of rows for the embedding tables. Can be low, due
-        to the hashing trick. Embeddings for prefix, suffix and word shape
-        use half as many rows. Recommended values are between 2000 and 10000.
-    also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
-        features in the embeddings. If not using these, you may need more
-        rows in your hash embeddings, as there will be increased chance of
-        collisions.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    attrs (list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute.
+    rows (List[int]): The number of rows in the embedding tables. Must have the
+        same length as attrs.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
+    if len(rows) != len(attrs):
+        raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
     seed = 7
 
-    def make_hash_embed(feature):
+    def make_hash_embed(index):
         nonlocal seed
         seed += 1
-        return HashEmbed(
-            width,
-            rows if feature == LOWER else rows // 2,
-            column=cols.index(feature),
-            seed=seed,
-            dropout=0.0,
-        )
+        return HashEmbed(width, rows[index], column=index, seed=seed, dropout=0.0)
 
-    if also_embed_subwords:
-        embeddings = [
-            make_hash_embed(LOWER),
-            make_hash_embed(PREFIX),
-            make_hash_embed(SUFFIX),
-            make_hash_embed(SHAPE),
-        ]
-    else:
-        embeddings = [make_hash_embed(LOWER)]
-    concat_size = width * (len(embeddings) + also_use_static_vectors)
-    if also_use_static_vectors:
+    embeddings = [make_hash_embed(i) for i in range(len(attrs))]
+    concat_size = width * (len(embeddings) + include_static_vectors)
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(
-                    FeatureExtractor(cols),
+                    FeatureExtractor(attrs),
                     list2ragged(),
                     with_array(concatenate(*embeddings)),
                 ),
@@ -155,7 +162,7 @@ def MultiHashEmbed(
             )
     else:
         model = chain(
-            FeatureExtractor(cols),
+            FeatureExtractor(list(attrs)),
             list2ragged(),
            with_array(concatenate(*embeddings)),
            with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
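The spacy.MultiHashEmbed.v1 architecture keeps its registry name but now takes explicit attrs and rows lists of equal length. A minimal sketch of calling it directly, using values that also appear in the tests further down:

    from spacy.ml.models.tok2vec import MultiHashEmbed

    embed = MultiHashEmbed(
        width=32,
        attrs=["NORM", "PREFIX", "SHAPE"],
        rows=[500, 500, 500],
        include_static_vectors=False,
    )
    # mismatched list lengths are rejected up front, e.g. rows=[500, 250] with
    # attrs=["NORM"] raises ValueError: Mismatched lengths: 2 vs 1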
@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
         """
@@ -162,7 +162,7 @@ cdef class Pipe:
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/pipe#get_loss
         """
@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
         """
@@ -249,7 +249,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
         """
@@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel):
     workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
     title: Optional[str] = Field(None, title="Project title")
+    spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
     # fmt: on
 
     class Config:
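With the new schema field, a project can pin the spaCy versions it supports directly in project.yml; load_project_config() then validates the range against about.__version__. An illustrative fragment (the range shown is only an example):

    # project.yml
    title: "Example pipeline project"
    spacy_version: ">=3.0.0a34,<3.1.0"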
@@ -248,7 +248,6 @@ def tt_tokenizer():
 @pytest.fixture(scope="session")
 def uk_tokenizer():
     pytest.importorskip("pymorphy2")
-    pytest.importorskip("pymorphy2.lang")
     return get_lang_class("uk")().tokenizer
 
 
@@ -285,8 +284,7 @@ def zh_tokenizer_jieba():
 
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
-    pytest.importorskip("pkuseg")
-    pytest.importorskip("pickle5")
+    pytest.importorskip("spacy_pkuseg")
     config = {
         "nlp": {
             "tokenizer": {
@@ -296,7 +294,7 @@ def zh_tokenizer_pkuseg():
         },
         "initialize": {
             "tokenizer": {
-                "pkuseg_model": "default",
+                "pkuseg_model": "web",
             }
         },
     }
@@ -209,9 +209,13 @@ def test_doc_retokenizer_split_norm(en_vocab):
     # Retokenize to split out the words in the token at doc[2].
     token = doc[2]
     with doc.retokenize() as retokenizer:
-        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+        retokenizer.split(
+            token,
+            ["brown", "fox", "jumps", "over", "the"],
+            heads=[(token, idx) for idx in range(5)],
+        )
 
     assert doc[9].text == "w/"
     assert doc[9].norm_ == "with"
     assert doc[5].text == "over"
     assert doc[5].norm_ == "over"
@@ -350,7 +350,7 @@ def test_pipe_methods_frozen():
 
 
 @pytest.mark.parametrize(
-    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"],
+    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"]
 )
 def test_pipe_label_data_exports_labels(pipe):
     nlp = Language()
@@ -24,9 +24,9 @@ def test_empty_doc():
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=width,
-            rows=embed_size,
-            also_use_static_vectors=False,
-            also_embed_subwords=True,
+            rows=[embed_size, embed_size, embed_size, embed_size],
+            include_static_vectors=False,
+            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
         MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
@@ -44,9 +44,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=width,
-            rows=embed_size,
-            also_use_static_vectors=False,
-            also_embed_subwords=True,
+            rows=[embed_size] * 4,
+            include_static_vectors=False,
+            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
         MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
@@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
 @pytest.mark.parametrize(
     "width,embed_arch,embed_config,encode_arch,encode_config",
     [
-        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
-        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
+        (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
+        (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
         (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
         (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
@@ -118,9 +118,9 @@ cfg_string = """
 [components.tok2vec.model.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = ${components.tok2vec.model.encode.width}
-rows = 2000
-also_embed_subwords = true
-also_use_static_vectors = false
+rows = [2000, 1000, 1000, 1000]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+include_static_vectors = false
 
 [components.tok2vec.model.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
@@ -1,6 +1,5 @@
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities
-import pytest
 
 
 def test_issue5918():
@@ -23,7 +22,8 @@ def test_issue5918():
     assert len(doc.ents) == 3
     # make it so that the third span's head is within the entity (ent_iob=I)
     # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
-    with pytest.warns(UserWarning):
-        doc[29].head = doc[33]
+    # TODO: test for logging here
+    # with pytest.warns(UserWarning):
+    # doc[29].head = doc[33]
     doc = merge_entities(doc)
     assert len(doc.ents) == 3
@@ -89,9 +89,9 @@ def my_parser():
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=321,
-            rows=5432,
-            also_embed_subwords=True,
-            also_use_static_vectors=False,
+            attrs=["LOWER", "SHAPE"],
+            rows=[5432, 5432],
+            include_static_vectors=False,
         ),
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
@ -7,6 +7,15 @@ from spacy import util
|
||||||
from spacy import prefer_gpu, require_gpu
|
from spacy import prefer_gpu, require_gpu
|
||||||
from spacy.ml._precomputable_affine import PrecomputableAffine
|
from spacy.ml._precomputable_affine import PrecomputableAffine
|
||||||
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
||||||
|
from spacy.util import dot_to_object, SimpleFrozenList
|
||||||
|
from thinc.api import Config, Optimizer, ConfigValidationError
|
||||||
|
from spacy.training.batchers import minibatch_by_words
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.lang.nl import Dutch
|
||||||
|
from spacy.language import DEFAULT_CONFIG_PATH
|
||||||
|
from spacy.schemas import ConfigSchemaTraining
|
||||||
|
|
||||||
|
from .util import get_random_doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -140,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
|
||||||
assert util.is_unconstrained_version(constraint) is expected
|
assert util.is_unconstrained_version(constraint) is expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"a1,a2,b1,b2,is_match",
|
||||||
|
[
|
||||||
|
("3.0.0", "3.0", "3.0.1", "3.0", True),
|
||||||
|
("3.1.0", "3.1", "3.2.1", "3.2", False),
|
||||||
|
("xxx", None, "1.2.3.dev0", "1.2", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_minor_version(a1, a2, b1, b2, is_match):
|
||||||
|
assert util.get_minor_version(a1) == a2
|
||||||
|
assert util.get_minor_version(b1) == b2
|
||||||
|
assert util.is_minor_version_match(a1, b1) is is_match
|
||||||
|
assert util.is_minor_version_match(a2, b2) is is_match
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"dot_notation,expected",
|
"dot_notation,expected",
|
||||||
[
|
[
|
||||||
|
@ -157,3 +181,128 @@ def test_dot_to_dict(dot_notation, expected):
|
||||||
result = util.dot_to_dict(dot_notation)
|
result = util.dot_to_dict(dot_notation)
|
||||||
assert result == expected
|
assert result == expected
|
||||||
    assert util.dict_to_dot(result) == dot_notation


@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 400, 199], [3]),
        ([400, 400, 199, 3], [4]),
        ([400, 400, 199, 3, 200], [3, 2]),
        ([400, 400, 199, 3, 1], [5]),
        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
        ([400, 400, 199, 3, 1, 200], [3, 3]),
        ([400, 400, 199, 3, 1, 999], [3, 3]),
        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
        ([1, 2, 999], [3]),
        ([1, 2, 999, 1], [4]),
        ([1, 200, 999, 1], [2, 2]),
        ([1, 999, 200, 1], [2, 2]),
    ],
)
def test_util_minibatch(doc_sizes, expected_batches):
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    tol = 0.2
    batch_size = 1000
    batches = list(
        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
    )
    assert [len(batch) for batch in batches] == expected_batches

    max_size = batch_size + batch_size * tol
    for batch in batches:
        assert sum([len(doc) for doc in batch]) < max_size


@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 4000, 199], [1, 2]),
        ([400, 400, 199, 3000, 200], [1, 4]),
        ([400, 400, 199, 3, 1, 1500], [1, 5]),
        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
        ([1, 2, 9999], [1, 2]),
        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
    ],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
    """Test that oversized documents are returned in their own batch."""
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    tol = 0.2
    batch_size = 1000
    batches = list(
        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
    )
    assert [len(batch) for batch in batches] == expected_batches


def test_util_dot_section():
    cfg_string = """
    [nlp]
    lang = "en"
    pipeline = ["textcat"]

    [components]

    [components.textcat]
    factory = "textcat"

    [components.textcat.model]
    @architectures = "spacy.TextCatBOW.v1"
    exclusive_classes = true
    ngram_size = 1
    no_output_layer = false
    """
    nlp_config = Config().from_str(cfg_string)
    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
    default_config["nlp"]["lang"] = "nl"
    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
    # Test that creation went OK
    assert isinstance(en_nlp, English)
    assert isinstance(nl_nlp, Dutch)
    assert nl_nlp.pipe_names == []
    assert en_nlp.pipe_names == ["textcat"]
    # not exclusive_classes
    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
    # Test that default values got overwritten
    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
    # Test proper functioning of 'dot_to_object'
    with pytest.raises(KeyError):
        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
    with pytest.raises(KeyError):
        dot_to_object(en_nlp.config, "nlp.unknownattribute")
    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)


def test_simple_frozen_list():
    t = SimpleFrozenList(["foo", "bar"])
    assert t == ["foo", "bar"]
    assert t.index("bar") == 1  # okay method
    with pytest.raises(NotImplementedError):
        t.append("baz")
    with pytest.raises(NotImplementedError):
        t.sort()
    with pytest.raises(NotImplementedError):
        t.extend(["baz"])
    with pytest.raises(NotImplementedError):
        t.pop()
    t = SimpleFrozenList(["foo", "bar"], error="Error!")
    with pytest.raises(NotImplementedError):
        t.append("baz")


def test_resolve_dot_names():
    config = {
        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
    }
    result = util.resolve_dot_names(config, ["training.optimizer"])
    assert isinstance(result[0], Optimizer)
    with pytest.raises(ConfigValidationError) as e:
        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
    errors = e.value.errors
    assert len(errors) == 1
    assert errors[0]["loc"] == ["training", "xyz"]
@@ -61,7 +61,10 @@ def get_tok2vec_kwargs():
    # This actually creates models, so seems best to put it in a function.
    return {
        "embed": MultiHashEmbed(
-           width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
+           width=32,
+           rows=[500, 500, 500],
+           attrs=["NORM", "PREFIX", "SHAPE"],
+           include_static_vectors=False,
        ),
        "encode": MaxoutWindowEncoder(
            width=32, depth=2, maxout_pieces=2, window_size=1

@@ -73,6 +76,32 @@ def test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())


def test_multi_hash_embed():
    embed = MultiHashEmbed(
        width=32,
        rows=[500, 500, 500],
        attrs=["NORM", "PREFIX", "SHAPE"],
        include_static_vectors=False,
    )
    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
    assert len(hash_embeds) == 3
    # Check they look at different columns.
    assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2]
    # Check they use different seeds
    assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3
    # Check they all have the same number of rows
    assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500]
    # Now try with different row factors
    embed = MultiHashEmbed(
        width=32,
        rows=[1000, 50, 250],
        attrs=["NORM", "PREFIX", "SHAPE"],
        include_static_vectors=False,
    )
    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
    assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]


@pytest.mark.parametrize(
    "seed,model_func,kwargs",
    [

@@ -1,137 +0,0 @@
-import pytest
-
-from spacy import util
-from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer, ConfigValidationError
-from spacy.training.batchers import minibatch_by_words
-from spacy.lang.en import English
-from spacy.lang.nl import Dutch
-from spacy.language import DEFAULT_CONFIG_PATH
-from spacy.schemas import ConfigSchemaTraining
-
-from .util import get_random_doc
@@ -1528,7 +1528,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
    while not heads_within_sents:
        heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
        if loop_count > 10:
-           warnings.warn(Warnings.W026)
+           util.logger.debug(Warnings.W026)
            break
        loop_count += 1
    # Set sentence starts

@@ -5,7 +5,7 @@ import copy
from functools import partial
from pydantic import BaseModel, StrictStr

-from ..util import registry, logger
+from ..util import registry
from ..tokens import Doc
from .example import Example

@@ -64,7 +64,7 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:


def lower_casing_augmenter(
-    nlp: "Language", example: Example, *, level: float,
+    nlp: "Language", example: Example, *, level: float
) -> Iterator[Example]:
    if random.random() >= level:
        yield example

@@ -119,9 +119,8 @@ def make_orth_variants(
    orig_token_dict = copy.deepcopy(token_dict)
    ndsv = orth_variants.get("single", [])
    ndpv = orth_variants.get("paired", [])
-   logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
-   words = token_dict.get("words", [])
-   tags = token_dict.get("tags", [])
+   words = token_dict.get("ORTH", [])
+   tags = token_dict.get("TAG", [])
    # keep unmodified if words or tags are not defined
    if words and tags:
        if lower:

@@ -154,8 +153,8 @@ def make_orth_variants(
                if words[word_idx] in pair:
                    pair_idx = pair.index(words[word_idx])
                    words[word_idx] = punct_choices[punct_idx][pair_idx]
-   token_dict["words"] = words
-   token_dict["tags"] = tags
+   token_dict["ORTH"] = words
+   token_dict["TAG"] = tags
    # modify raw
    if raw is not None:
        variants = []

@@ -103,7 +103,7 @@ def conll_ner_to_docs(
        lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
        cols = list(zip(*[line.split() for line in lines]))
        if len(cols) < 2:
-           raise ValueError(Errors.E093)
+           raise ValueError(Errors.E903)
        length = len(cols[0])
        words.extend(cols[0])
        sent_starts.extend([True] + [False] * (length - 1))

@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
            sent_words, sent_iob = zip(*sent_tokens)
            sent_tags = ["-"] * len(sent_words)
        else:
-           raise ValueError(Errors.E092)
+           raise ValueError(Errors.E902)
        words.extend(sent_words)
        tags.extend(sent_tags)
        iob.extend(sent_iob)

@@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING
from pathlib import Path
from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
from wasabi import Printer
import random
-import wasabi
import sys
import shutil

from .example import Example
from ..schemas import ConfigSchemaTraining
from ..errors import Errors
-from ..util import resolve_dot_names, registry
+from ..util import resolve_dot_names, registry, logger

if TYPE_CHECKING:
    from ..language import Language  # noqa: F401


DIR_MODEL_BEST = "model-best"
DIR_MODEL_LAST = "model-last"


def train(
    nlp: "Language",
    output_path: Optional[Path] = None,

@@ -38,7 +43,7 @@ def train(
    RETURNS (Path / None): The path to the final exported model.
    """
    # We use no_print here so we can respect the stdout/stderr options.
-   msg = wasabi.Printer(no_print=True)
+   msg = Printer(no_print=True)
    # Create iterator, which yields out info after each optimization step.
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:

@@ -69,6 +74,7 @@ def train(
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
    )
    clean_output_dir(output_path)
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")

@@ -83,7 +89,7 @@ def train(
                update_meta(T, nlp, info)
                with nlp.use_params(optimizer.averages):
                    nlp = before_to_disk(nlp)
-                   nlp.to_disk(output_path / "model-best")
+                   nlp.to_disk(output_path / DIR_MODEL_BEST)
    except Exception as e:
        if output_path is not None:
            # We don't want to swallow the traceback if we don't have a

@@ -100,7 +106,7 @@ def train(
    finally:
        finalize_logger()
    if output_path is not None:
-       final_model_path = output_path / "model-last"
+       final_model_path = output_path / DIR_MODEL_LAST
        if optimizer.averages:
            with nlp.use_params(optimizer.averages):
                nlp.to_disk(final_model_path)

@@ -305,3 +311,19 @@ def create_before_to_disk_callback(
        return modified_nlp

    return before_to_disk


def clean_output_dir(path: Union[str, Path]) -> None:
    """Remove an existing output directory. Typically used to ensure that a
    directory like model-best and its contents aren't just being overwritten
    by nlp.to_disk, which could preserve existing subdirectories (e.g.
    components that don't exist anymore).
    """
    if path is not None and path.exists():
        for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]:
            if subdir.exists():
                try:
                    shutil.rmtree(str(subdir))
                    logger.debug(f"Removed existing output directory: {subdir}")
                except Exception as e:
                    raise IOError(Errors.E901.format(path=path)) from e
@@ -73,6 +73,7 @@ logger = logging.getLogger("spacy")

class ENV_VARS:
    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
    PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"


class registry(thinc.registry):

@@ -584,6 +585,33 @@ def get_base_version(version: str) -> str:
    return Version(version).base_version


def get_minor_version(version: str) -> Optional[str]:
    """Get the major + minor version (without patch or prerelease identifiers).

    version (str): The version.
    RETURNS (str): The major + minor version or None if version is invalid.
    """
    try:
        v = Version(version)
    except (TypeError, InvalidVersion):
        return None
    return f"{v.major}.{v.minor}"


def is_minor_version_match(version_a: str, version_b: str) -> bool:
    """Compare two versions and check if they match in major and minor, without
    patch or prerelease identifiers. Used internally for compatibility checks
    that should be insensitive to patch releases.

    version_a (str): The first version
    version_b (str): The second version.
    RETURNS (bool): Whether the versions match.
    """
    a = get_minor_version(version_a)
    b = get_minor_version(version_b)
    return a is not None and b is not None and a == b


def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Load a model meta.json from a path and validate its contents.

@@ -1315,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool:
        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
        return hasattr(cls_func, attr)
    return False


def check_bool_env_var(env_var: str) -> bool:
    """Convert the value of an environment variable to a boolean. Add special
    check for "0" (falsy) and consider everything else truthy, except unset.

    env_var (str): The name of the environment variable to check.
    RETURNS (bool): Its boolean value.
    """
    value = os.environ.get(env_var, False)
    if value == "0":
        return False
    return bool(value)
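These helpers are easy to sanity-check in isolation. A small usage sketch (the version strings and environment variable values below are illustrative only):

```python
import os
from spacy import util

# Major + minor only; patch and prerelease identifiers are dropped.
assert util.get_minor_version("3.0.0a34") == "3.0"
assert util.get_minor_version("not a version") is None

# Patch releases match, different minor versions don't.
assert util.is_minor_version_match("3.0.1", "3.0.5")
assert not util.is_minor_version_match("3.0.1", "3.1.0")

# Only "0" (or an unset variable) counts as falsy.
os.environ["SPACY_PROJECT_USE_GIT_VERSION"] = "0"
assert util.check_bool_env_var("SPACY_PROJECT_USE_GIT_VERSION") is False
os.environ["SPACY_PROJECT_USE_GIT_VERSION"] = "true"
assert util.check_bool_env_var("SPACY_PROJECT_USE_GIT_VERSION") is True
```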
@@ -445,9 +445,9 @@ cdef class Vocab:
        setters = ["strings", "vectors"]
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
-       if "vectors" not in "exclude" and self.vectors is not None:
+       if "vectors" not in exclude:
            self.vectors.to_disk(path)
-       if "lookups" not in "exclude" and self.lookups is not None:
+       if "lookups" not in exclude:
            self.lookups.to_disk(path)

    def from_disk(self, path, *, exclude=tuple()):
@@ -136,25 +136,28 @@ argument that connects to the shared `tok2vec` component in the pipeline.
> [model]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
-> rows = 2000
-> also_embed_subwords = false
-> also_use_static_vectors = false
+> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+> rows = [2000, 1000, 1000, 1000]
+> include_static_vectors = true
> ```

Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used can
be configured with the `attrs` argument. The suggested attributes are `NORM`,
`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in the
representation as well; the vectors table will be kept static (i.e. it's not
updated).

| Name                     | Description |
| ------------------------ | ----------- |
| `width`                  | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
| `attrs`                  | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
| `rows`                   | The number of rows for each embedding table. The layer needs surprisingly few rows, due to its use of the hashing trick: generally between `2000` and `10000` rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES**              | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
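The same layer can also be constructed directly in Python, which can be handy for quick experiments. This is only a sketch and assumes the layer is importable from `spacy.ml.models`, as in spaCy's own model tests; in a pipeline you would normally configure it through the config block shown above:

```python
from spacy.ml.models import MultiHashEmbed

# One embedding table per attribute: `rows` and `attrs` must have the same length.
embed = MultiHashEmbed(
    width=64,
    attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[2000, 1000, 1000, 1000],
    include_static_vectors=False,  # no vectors table required for this example
)
# The result is a Model[List[Doc], List[Floats2d]]: one embedding matrix per Doc.
```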

### spacy.CharacterEmbed.v1 {#CharacterEmbed}

@@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
Find the loss and gradient of loss for the batch of documents and their
predicted scores.

<Infobox variant="danger">

This method needs to be overwritten with your own custom `get_loss` method.

</Infobox>

> #### Example
>
> ```python
@@ -86,7 +86,8 @@ see are:
| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. |
| ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. |

-The model type signatures help you figure out which model architectures and
+See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The
+model type signatures help you figure out which model architectures and
components can **fit together**. For instance, the
[`TextCategorizer`](/api/textcategorizer) class expects a model typed
~~Model[List[Doc], Floats2d]~~, because the model will predict one row of

@@ -288,7 +289,7 @@ those parts of the network.

To use our custom model including the PyTorch subnetwork, all we need to do is
register the architecture using the
-[`architectures` registry](/api/top-level#registry). This will assign the
+[`architectures` registry](/api/top-level#registry). This assigns the
architecture a name so spaCy knows how to find it, and allows passing in
arguments like hyperparameters via the [config](/usage/training#config). The
full example then becomes:

@@ -373,7 +374,7 @@ gpu_allocator = "pytorch"
Of course it's also possible to define the `Model` from the previous section
entirely in Thinc. The Thinc documentation provides details on the
[various layers](https://thinc.ai/docs/api-layers) and helper functions
-available. Combinators can also be used to
+available. Combinators can be used to
[overload operators](https://thinc.ai/docs/usage-models#operators) and a common
usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
simple neural network would then become:
@@ -486,28 +487,376 @@ with Model.define_operators({">>": chain}):

## Create new trainable components {#components}

In addition to [swapping out](#swap-architectures) default models in built-in
components, you can also implement an entirely new,
[trainable](/usage/processing-pipelines#trainable-components) pipeline component
from scratch. This can be done by creating a new class inheriting from
[`Pipe`](/api/pipe), and linking it up to your custom model implementation.

<Infobox title="Trainable component API" emoji="💡">

For details on how to implement pipeline components, check out the usage guide
on [custom components](/usage/processing-pipelines#custom-component) and the
overview of the `Pipe` methods used by
[trainable components](/usage/processing-pipelines#trainable-components).

</Infobox>

### Example: Entity relation extraction component {#component-rel}

This section outlines an example use-case of implementing a **novel relation
extraction component** from scratch. We'll implement a binary relation
extraction method that determines whether or not **two entities** in a document
are related, and if so, what type of relation. We'll allow multiple types of
relations between two such entities (multi-label setting). There are two major
steps required:

1. Implement a [machine learning model](#component-rel-model) specific to this
   task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
   a relation for the available candidate pairs.
2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
   machine learning model that sets annotations on the [`Doc`](/api/doc) passing
   through the pipeline.

<!-- TODO: <Project id="tutorials/ner-relations">

</Project> -->

#### Step 1: Implementing the Model {#component-rel-model}

We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional
matrix** (~~Floats2d~~) of predictions:

> #### Model type annotations
>
> The `Model` class is a generic type that can specify its input and output
> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static
> type checks and validation. See the section on [type signatures](#type-sigs)
> for details.

```python
### Register the model architecture
@registry.architectures.register("rel_model.v1")
def create_relation_model(...) -> Model[List[Doc], Floats2d]:
    model = ...  # 👈 model will go here
    return model
```

The first layer in this model will typically be an
[embedding layer](/usage/embeddings-transformers) such as a
[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
transforms each **document into a list of tokens**, with each token being
represented by its embedding in the vector space.
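For illustration only, here is a minimal sketch of how such an embedding layer could be plugged into a model with the required ~~Model[List[Doc], Floats2d]~~ signature, using generic Thinc combinators. It is not the relation model itself — it simply pools each document and ignores candidate pairs entirely — but it shows how the pieces compose:

```python
from typing import List

from thinc.api import Model, chain, list2ragged, reduce_mean, Linear
from thinc.types import Floats2d
from spacy.tokens import Doc


def build_toy_model(
    tok2vec: Model[List[Doc], List[Floats2d]], n_labels: int
) -> Model[List[Doc], Floats2d]:
    # Embed the tokens, pool each document to a single vector and map it to
    # one score per label. A real relation model would instead build one row
    # per candidate entity pair.
    return chain(tok2vec, list2ragged(), reduce_mean(), Linear(nO=n_labels))
```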
Next, we need a method that **generates pairs of entities** that we want to
classify as being related or not. As these candidate pairs are typically formed
within one document, this function takes a [`Doc`](/api/doc) as input and
outputs a `List` of `Span` tuples. For instance, a very straightforward
implementation would be to just take any two entities from the same document:

```python
### Simple candidate generation
def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
    candidates = []
    for ent1 in doc.ents:
        for ent2 in doc.ents:
            candidates.append((ent1, ent2))
    return candidates
```

But we could also refine this further by **excluding relations** of an entity
with itself, and posing a **maximum distance** (in number of tokens) between two
entities. We register this function in the
[`@misc` registry](/api/top-level#registry) so we can refer to it from the
config, and easily swap it out for any other candidate generation function.

> #### config.cfg (excerpt)
>
> ```ini
> [model]
> @architectures = "rel_model.v1"
>
> [model.tok2vec]
> # ...
>
> [model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```

```python
### Extended candidate generation {highlight="1,2,7,8"}
@registry.misc.register("rel_cand_generator.v1")
def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
    def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
        candidates = []
        for ent1 in doc.ents:
            for ent2 in doc.ents:
                if ent1 != ent2:
                    if max_length and abs(ent2.start - ent1.start) <= max_length:
                        candidates.append((ent1, ent2))
        return candidates
    return get_candidates
```

Finally, we require a method that transforms the candidate entity pairs into a
2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
processed by a final `output_layer` of the network. Putting all this together,
we can define our relation model in a config file as such:

```ini
### config.cfg
[model]
@architectures = "rel_model.v1"
# ...

[model.tok2vec]
# ...

[model.get_candidates]
@misc = "rel_cand_generator.v2"
max_length = 20

[model.create_candidate_tensor]
@misc = "rel_cand_tensor.v1"

[model.output_layer]
@architectures = "rel_output_layer.v1"
# ...
```

<!-- TODO: link to project for implementation details -->
<!-- TODO: maybe embed files from project that show the architectures? -->

When creating this model, we store the custom functions as
[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
references, so we can access them easily:

```python
tok2vec_layer = model.get_ref("tok2vec")
output_layer = model.get_ref("output_layer")
create_candidate_tensor = model.attrs["create_candidate_tensor"]
get_candidates = model.attrs["get_candidates"]
```

#### Step 2: Implementing the pipeline component {#component-rel-pipe}

To use our new relation extraction model as part of a custom
[trainable component](/usage/processing-pipelines#trainable-components), we
create a subclass of [`Pipe`](/api/pipe) that holds the model:

```python
### Pipeline component skeleton
from spacy.pipeline import Pipe

class RelationExtractor(Pipe):
    def __init__(self, vocab, model, name="rel"):
        """Create a component instance."""
        self.model = model
        self.vocab = vocab
        self.name = name

    def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
        """Learn from a batch of Example objects."""
        ...

    def predict(self, docs):
        """Apply the model to a batch of Doc objects."""
        ...

    def set_annotations(self, docs, predictions):
        """Modify a batch of Doc objects using the predictions."""
        ...

    def initialize(self, get_examples, nlp=None, labels=None):
        """Initialize the model before training."""
        ...

    def add_label(self, label):
        """Add a label to the component."""
        ...
```

Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
labels can be provided to `initialize`, or you can call
`RelationExtractor.add_label` directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
layers of the neural network. This is triggered by calling
[`Model.initialize`](https://thinc.ai/api/model#initialize).

```python
### The initialize method {highlight="12,18,22"}
from itertools import islice

def initialize(
    self,
    get_examples: Callable[[], Iterable[Example]],
    *,
    nlp: Language = None,
    labels: Optional[List[str]] = None,
):
    if labels is not None:
        for label in labels:
            self.add_label(label)
    else:
        for example in get_examples():
            relations = example.reference._.rel
            for indices, label_dict in relations.items():
                for label in label_dict.keys():
                    self.add_label(label)
    subbatch = list(islice(get_examples(), 10))
    doc_sample = [eg.reference for eg in subbatch]
    label_sample = self._examples_to_truth(subbatch)
    self.model.initialize(X=doc_sample, Y=label_sample)
```

The `initialize` method is triggered whenever this component is part of an `nlp`
pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked.
Typically, this happens when the pipeline is set up before training in
[`spacy train`](/api/cli#training). After initialization, the pipeline component
and its internal model can be trained and used to make predictions.

During training, the function [`update`](/api/pipe#update) is invoked, which
delegates to
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
batch of examples, as well as the **gradient** of loss that will be used to
update the weights of the model layers. Thinc provides several
[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
implementation of the `get_loss` function.

```python
### The update method {highlight="12-14"}
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    set_annotations: bool = False,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    ...
    docs = [ex.predicted for ex in examples]
    predictions, backprop = self.model.begin_update(docs)
    loss, gradient = self.get_loss(examples, predictions)
    backprop(gradient)
    losses[self.name] += loss
    ...
    return losses
```
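The `get_loss` method itself is left to the reader. A minimal sketch — assuming a mean-squared-error objective over the multi-label scores and the `self._examples_to_truth` helper already referenced in `initialize` above — could look like this:

```python
### A possible get_loss implementation (sketch)
from typing import Tuple
from thinc.types import Floats2d

def get_loss(self, examples, scores: Floats2d) -> Tuple[float, Floats2d]:
    # Build the gold-standard label matrix with the same shape as the scores.
    truths = self.model.ops.asarray(self._examples_to_truth(examples))
    # For a mean squared error objective, the gradient is the difference
    # between the predicted scores and the gold labels.
    gradient = scores - truths
    mean_square_error = (gradient ** 2).sum(axis=1).mean()
    return float(mean_square_error), gradient
```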
When the internal model is trained, the component can be used to make novel
**predictions**. The [`predict`](/api/pipe#predict) function needs to be
implemented for each subclass of `Pipe`. In our case, we can simply delegate to
the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:

```python
### The predict method
def predict(self, docs: Iterable[Doc]) -> Floats2d:
    predictions = self.model.predict(docs)
    return self.model.ops.asarray(predictions)
```

The final method that needs to be implemented is
[`set_annotations`](/api/pipe#set_annotations). This function takes the
predictions, and modifies the given `Doc` object in place to store them. For our
relation extraction component, we store the data as a dictionary in a custom
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
each entity**, as this defines an entity pair uniquely within one document.

To interpret the scores predicted by the relation extraction model correctly, we
need to refer to the model's `get_candidates` function that defined which pairs
of entities were relevant candidates, so that the predictions can be linked to
those exact entities:

> #### Example output
>
> ```python
> doc = nlp("Amsterdam is the capital of the Netherlands.")
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
> for value, rel_dict in doc._.rel.items():
>     print(f"{value}: {rel_dict}")
>
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
> ```

```python
### Registering the extension attribute
from spacy.tokens import Doc
Doc.set_extension("rel", default={})
```

```python
### The set_annotations method {highlight="5-6,10"}
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
    c = 0
    get_candidates = self.model.attrs["get_candidates"]
    for doc in docs:
        for (e1, e2) in get_candidates(doc):
            offset = (e1.start, e2.start)
            if offset not in doc._.rel:
                doc._.rel[offset] = {}
            for j, label in enumerate(self.labels):
                doc._.rel[offset][label] = predictions[c, j]
            c += 1
```

Under the hood, when the pipe is applied to a document, it delegates to the
`predict` and `set_annotations` methods:

```python
### The __call__ method
def __call__(self, doc: Doc):
    predictions = self.predict([doc])
    self.set_annotations([doc], predictions)
    return doc
```

Once our `Pipe` subclass is fully implemented, we can
[register](/usage/processing-pipelines#custom-components-factories) the
component with the [`@Language.factory`](/api/language#factory) decorator. This
assigns it a name and lets you create the component with
[`nlp.add_pipe`](/api/language#add_pipe) and via the
[config](/usage/training#config).

> #### config.cfg (excerpt)
>
> ```ini
> [components.relation_extractor]
> factory = "relation_extractor"
>
> [components.relation_extractor.model]
> @architectures = "rel_model.v1"
>
> [components.relation_extractor.model.tok2vec]
> # ...
>
> [components.relation_extractor.model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```

```python
### Registering the pipeline component
from spacy.language import Language

@Language.factory("relation_extractor")
def make_relation_extractor(nlp, name, model):
    return RelationExtractor(nlp.vocab, model, name)
```

<!-- TODO: <Project id="tutorials/ner-relations">

</Project> -->
@@ -1176,7 +1176,7 @@ plug fully custom machine learning components into your pipeline. You'll need
the following:

1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-  can be a model using implemented in
+  can be a model implemented in
   [Thinc](/usage/layers-architectures#thinc), or a
   [wrapped model](/usage/layers-architectures#frameworks) implemented in
   PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a

@@ -216,15 +216,16 @@ pipelines.
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```

| Section         | Description |
| --------------- | ----------- |
| `title`         | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description`   | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars`          | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories`   | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets`        | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows`     | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `commands`      | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. |

### Data assets {#data-assets}
@@ -38,7 +38,7 @@
  cursor: pointer
  display: inline-block
  padding: 0.35rem 0.5rem 0.25rem 0
- margin: 0 1rem 0.75rem 0
+ margin: 0 1rem 0.5rem 0
  font-size: var(--font-size-xs)
  font-weight: bold

@@ -73,16 +73,19 @@
    background: var(--color-theme)

  .checkbox + &:before
    $size: 18px
    content: ""
    display: inline-block
-   width: 20px
+   width: $size
-   height: 20px
+   height: $size
    border: 1px solid var(--color-subtle)
    vertical-align: middle
    margin-right: 0.5rem
    cursor: pointer
-   border-radius: var(--border-radius)
+   border-radius: $size / 4
    background: var(--color-back)
    position: relative
    top: -1px

  .checkbox:checked + &:before
    // Embed "check" icon here for simplicity
|
@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby'
|
||||||
import { Quickstart, QS } from '../components/quickstart'
|
import { Quickstart, QS } from '../components/quickstart'
|
||||||
import { repo } from '../components/util'
|
import { repo } from '../components/util'
|
||||||
|
|
||||||
|
const DEFAULT_MODELS = ['en']
|
||||||
|
const DEFAULT_OPT = 'efficiency'
|
||||||
const DEFAULT_HARDWARE = 'cpu'
|
const DEFAULT_HARDWARE = 'cpu'
|
||||||
const DEFAULT_CUDA = 'cuda100'
|
const DEFAULT_CUDA = 'cuda100'
|
||||||
const CUDA = {
|
const CUDA = {
|
||||||
|
@@ -15,6 +17,7 @@ const CUDA = {
     '10.1': 'cuda101',
     '10.2': 'cuda102',
 }
+const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
 const DATA = [
     {
         id: 'os',

@@ -68,14 +71,24 @@ const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
+    const [selectedModels, setModels] = useState(DEFAULT_MODELS)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
         config: v => setTrain(v.includes('train')),
+        models: setModels,
+        optimize: v => setEfficiency(v.includes('efficiency')),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
     }
-    const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups']
+    const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+    const pipExtras = [
+        hardware === 'gpu' && cuda,
+        train && 'transformers',
+        train && 'lookups',
+        ...modelExtras,
+    ]
         .filter(e => e)
         .join(',')
     return (

@@ -89,13 +102,37 @@ const QuickstartInstall = ({ id, title }) => {
         ...DATA,
         {
             id: 'models',
-            title: 'Trained Pipelines',
+            title: 'Trained pipelines',
             multiple: true,
             options: models
                 .sort((a, b) => a.name.localeCompare(b.name))
-                .map(({ code, name }) => ({ id: code, title: name })),
+                .map(({ code, name }) => ({
+                    id: code,
+                    title: name,
+                    checked: DEFAULT_MODELS.includes(code),
+                })),
         },
     ]
+    if (selectedModels.length) {
+        data.push({
+            id: 'optimize',
+            title: 'Select pipeline for',
+            options: [
+                {
+                    id: 'efficiency',
+                    title: 'efficiency',
+                    checked: DEFAULT_OPT === 'efficiency',
+                    help: 'Faster and smaller pipeline, but less accurate',
+                },
+                {
+                    id: 'accuracy',
+                    title: 'accuracy',
+                    checked: DEFAULT_OPT === 'accuracy',
+                    help: 'Larger and slower pipeline, but more accurate',
+                },
+            ],
+        })
+    }
     return (
         <Quickstart
             data={data}

@@ -149,11 +186,14 @@ const QuickstartInstall = ({ id, title }) => {
                 conda install -c conda-forge spacy-lookups-data
             </QS>
 
-            {models.map(({ code, models: modelOptions }) => (
-                <QS models={code} key={code}>
-                    python -m spacy download {modelOptions[0]}
-                </QS>
-            ))}
+            {models.map(({ code, models: modelOptions }) => {
+                const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
+                return (
+                    <QS models={code} key={code}>
+                        python -m spacy download {pkg}
+                    </QS>
+                )
+            })}
         </Quickstart>
     )
 }}

@@ -31,25 +31,33 @@ const data = [
     },
     {
         id: 'optimize',
-        title: 'Optimize for',
-        help:
-            'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+        title: 'Select for',
         options: [
-            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
-            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+            {
+                id: 'efficiency',
+                title: 'efficiency',
+                checked: DEFAULT_OPT === 'efficiency',
+                help: 'Faster and smaller pipeline, but less accurate',
+            },
+            {
+                id: 'accuracy',
+                title: 'accuracy',
+                checked: DEFAULT_OPT === 'accuracy',
+                help: 'Larger and slower pipeline, but more accurate',
+            },
         ],
     },
     {
         id: 'config',
         title: 'Options',
         multiple: true,
-        options: [{ id: 'example', title: 'Show usage example' }],
+        options: [{ id: 'example', title: 'Show text example' }],
     },
 ]
 
 const QuickstartInstall = ({ id, title, description, children }) => {
     const [lang, setLang] = useState(DEFAULT_LANG)
-    const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         lang: setLang,
         optimize: v => setEfficiency(v.includes('efficiency')),