mirror of https://github.com/explosion/spaCy.git
synced 2025-08-02 19:30:19 +03:00

commit 54bdc11353
Merge branch 'master' of https://github.com/explosion/spaCy into feature/etl

.github/azure-steps.yml (vendored, 75 changes)

@@ -1,9 +1,7 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'
+  num_build_jobs: 2
 
 steps:
   - task: UsePythonVersion@0
@@ -17,16 +15,16 @@ steps:
     displayName: 'Set variables'
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"
 
   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"
 
-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
     displayName: 'Run mypy'
     condition: ne(variables['python_version'], '3.6')
@@ -35,35 +33,24 @@ steps:
       contents: "spacy"
     displayName: "Delete source directory"
 
+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
    displayName: "Uninstall all packages"
 
   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
     displayName: "Install from sdist"
 
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+  - script: |
+      python -W error -c "import spacy"
+    displayName: "Test import"
 
   - script: |
       python -m spacy download ca_core_news_sm
@@ -72,6 +59,11 @@ steps:
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')
+
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
     displayName: 'Test convert CLI'
@@ -106,13 +98,22 @@ steps:
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install --pre thinc-apple-ops
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
+
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json
     displayName: 'Test website/meta/universe.json'
     condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
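A note on the retained SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) line: os.listdir() returns entries in arbitrary order, which is only safe here because dist/ holds a single sdist per build. A hedged Python sketch of a deterministic variant (the helper name is made up for illustration):

    import os

    def newest_sdist(dist_dir: str = "./dist") -> str:
        # Pick the sdist explicitly instead of relying on listdir order;
        # sorted() is lexicographic, which is fine for a single build's output.
        sdists = sorted(f for f in os.listdir(dist_dir) if f.endswith(".tar.gz"))
        return sdists[-1]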
.github/workflows/autoblack.yml (vendored, 9 changes)

@@ -12,10 +12,10 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
@@ -23,10 +23,11 @@ jobs:
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
+
      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        with:
          title: Auto-format code with black
          labels: meta
.github/workflows/explosionbot.yml (vendored, 6 changes)

@@ -8,14 +8,14 @@ on:
 
 jobs:
   explosion-bot:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     steps:
      - name: Dump GitHub context
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
      - name: Install and run explosion-bot
        run: |
          pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
.github/workflows/slowtests.yml (vendored, 6 changes)

@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
@@ -23,9 +23,9 @@ jobs:
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after="$yesterday" --before="$today" | grep commit ; then
-            echo "::set-output name=run_tests::true"
+            echo run_tests=true >> $GITHUB_OUTPUT
          else
-            echo "::set-output name=run_tests::false"
+            echo run_tests=false >> $GITHUB_OUTPUT
          fi
 
      - name: Trigger buildkite build
.github/workflows/spacy_universe_alert.yml (vendored, 6 changes)

@@ -17,8 +17,10 @@ jobs:
        run: |
          echo "$GITHUB_CONTEXT"
 
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
      - name: Install Bernadette app dependency and send an alert
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
README.md

@@ -8,7 +8,7 @@ be used in real products.
 
 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the MIT license.
 
-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [![Azure Pipelines](…)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.
 
 ## Features
 
-- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings
azure-pipelines.yml

@@ -76,24 +76,24 @@ jobs:
       # Python39Mac:
       #   imageName: "macos-latest"
       #   python.version: "3.9"
-      Python310Linux:
-        imageName: "ubuntu-latest"
-        python.version: "3.10"
+      # Python310Linux:
+      #   imageName: "ubuntu-latest"
+      #   python.version: "3.10"
      Python310Windows:
        imageName: "windows-latest"
        python.version: "3.10"
-      Python310Mac:
-        imageName: "macos-latest"
-        python.version: "3.10"
+      # Python310Mac:
+      #   imageName: "macos-latest"
+      #   python.version: "3.10"
      Python311Linux:
        imageName: 'ubuntu-latest'
-        python.version: '3.11.0-rc.2'
+        python.version: '3.11'
      Python311Windows:
        imageName: 'windows-latest'
-        python.version: '3.11.0-rc.2'
+        python.version: '3.11'
      Python311Mac:
        imageName: 'macos-latest'
-        python.version: '3.11.0-rc.2'
+        python.version: '3.11'
    maxParallel: 4
  pool:
    vmImage: $(imageName)
@@ -101,20 +101,3 @@ jobs:
  - template: .github/azure-steps.yml
    parameters:
      python_version: '$(python.version)'
-      architecture: 'x64'
-
-# - job: "TestGPU"
-#   dependsOn: "Validate"
-#   strategy:
-#     matrix:
-#       Python38LinuxX64_GPU:
-#         python.version: '3.8'
-#   pool:
-#     name: "LinuxX64_GPU"
-#   steps:
-#     - template: .github/azure-steps.yml
-#       parameters:
-#         python_version: '$(python.version)'
-#         architecture: 'x64'
-#         gpu: true
-#         num_build_jobs: 24
requirements.txt

@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.1.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.5.0
+typer>=0.3.0,<0.8.0
 pathy>=0.3.5
 # Third party dependencies
 numpy>=1.15.0
setup.cfg

@@ -51,7 +51,7 @@ install_requires =
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
-    typer>=0.3.0,<0.5.0
+    typer>=0.3.0,<0.8.0
     pathy>=0.3.5
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
spacy/cli/project/remote_storage.py

@@ -10,6 +10,7 @@ from .._util import get_hash, get_checksum, download_file, ensure_pathy
 from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
 from ...git_info import GIT_VERSION
 from ... import about
+from ...errors import Errors
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -84,7 +85,23 @@ class RemoteStorage:
         with tarfile.open(tar_loc, mode=mode_string) as tar_file:
             # This requires that the path is added correctly, relative
             # to root. This is how we set things up in push()
-            tar_file.extractall(self.root)
+
+            # Disallow paths outside the current directory for the tar
+            # file (CVE-2007-4559, directory traversal vulnerability)
+            def is_within_directory(directory, target):
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                prefix = os.path.commonprefix([abs_directory, abs_target])
+                return prefix == abs_directory
+
+            def safe_extract(tar, path):
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise ValueError(Errors.E852)
+                tar.extractall(path)
+
+            safe_extract(tar_file, self.root)
         return url
 
     def find(
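The pull() hunk above is the standard mitigation for CVE-2007-4559: every member path is resolved against the extraction root before extractall() runs. A minimal standalone sketch of the same check, runnable outside spaCy (the archive and output paths are illustrative):

    import os
    import tarfile

    def is_within_directory(directory: str, target: str) -> bool:
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        return os.path.commonprefix([abs_directory, abs_target]) == abs_directory

    def safe_extract(tar: tarfile.TarFile, path: str) -> None:
        for member in tar.getmembers():
            # A member named e.g. "../../etc/passwd" resolves outside `path`
            if not is_within_directory(path, os.path.join(path, member.name)):
                raise ValueError(f"unsafe path traversal in tar member: {member.name}")
        tar.extractall(path)

    with tarfile.open("pulled.tar.gz") as tar_file:
        safe_extract(tar_file, "./output")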
spacy/cli/project/run.py

@@ -53,6 +53,7 @@ def project_run(
     force: bool = False,
     dry: bool = False,
     capture: bool = False,
+    skip_requirements_check: bool = False,
 ) -> None:
     """Run a named script defined in the project.yml. If the script is part
     of the default pipeline (defined in the "run" section), DVC is used to
@@ -69,6 +70,7 @@ def project_run(
     sys.exit will be called with the return code. You should use capture=False
     when you want to turn over execution to the command, and capture=True
     when you want to run the command more like a function.
+    skip_requirements_check (bool): Whether to skip the requirements check.
     """
     config = load_project_config(project_dir, overrides=overrides)
     commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
@@ -76,9 +78,10 @@ def project_run(
     validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
 
     req_path = project_dir / "requirements.txt"
-    if config.get("check_requirements", True) and os.path.exists(req_path):
-        with req_path.open() as requirements_file:
-            _check_requirements([req.replace("\n", "") for req in requirements_file])
+    if not skip_requirements_check:
+        if config.get("check_requirements", True) and os.path.exists(req_path):
+            with req_path.open() as requirements_file:
+                _check_requirements([req.strip() for req in requirements_file])
 
     if subcommand in workflows:
         msg.info(f"Running workflow '{subcommand}'")
@@ -90,6 +93,7 @@ def project_run(
             force=force,
             dry=dry,
             capture=capture,
+            skip_requirements_check=True,
         )
     else:
         cmd = commands[subcommand]
@@ -338,6 +342,10 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
             failed_pkgs_msgs.append(dnf.report())
         except pkg_resources.VersionConflict as vc:
             conflicting_pkgs_msgs.append(vc.report())
+        except Exception:
+            msg.warn(f"Unable to check requirement: {req} "
+                     "Checks are currently limited to requirement specifiers "
+                     "(PEP 508)")
 
     if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
         msg.warn(
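Replacing req.replace("\n", "") with req.strip() also drops leading indentation and stray carriage returns that the old call left in place, which matters once requirement lines come from embedded strings or CRLF files. In miniature:

    lines = ["  spacy>=3.0\n", "thinc\r\n"]
    print([line.replace("\n", "") for line in lines])  # ['  spacy>=3.0', 'thinc\r']
    print([line.strip() for line in lines])            # ['spacy>=3.0', 'thinc']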
spacy/errors.py

@@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes):
     W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
     W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
             "is a Cython extension type.")
-    W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
-            "aware that this might affect other components in your pipeline.")
+    W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
+            "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
             "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")
@@ -546,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E852 = ("The tar file pulled from the remote attempted an unsafe path "
+            "traversal.")
     E853 = ("Unsupported component factory name '{name}'. The character '.' is "
             "not permitted in factory names.")
     E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
spacy/language.py

@@ -1879,31 +1879,22 @@ class Language:
         if isinstance(exclude, str):
             exclude = [exclude]
 
-        def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]:
-            """Fetch value for `enable` or `disable` w.r.t. the specified config and arguments passed to
-            .load(). If both arguments and config specified values for this field, the passed arguments take precedence
-            and a warning is printed.
-            value (Iterable[str]): Passed value for `enable` or `disable`.
-            key (str): Key for field in config (either "enabled" or "disabled").
-            RETURN (Iterable[str]):
-            """
-            # We assume that no argument was passed if the value is the specified default value.
-            if id(value) == id(_DEFAULT_EMPTY_PIPES):
-                return config["nlp"].get(key, [])
-            else:
-                if len(config["nlp"].get(key, [])):
+        # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config
+        # specifies values for `enabled` not included in `enable`, emit warning.
+        if id(enable) != id(_DEFAULT_EMPTY_PIPES):
+            enabled = config["nlp"].get("enabled", [])
+            if len(enabled) and not set(enabled).issubset(enable):
                 warnings.warn(
                     Warnings.W123.format(
-                        arg=key[:-1],
-                        arg_value=value,
-                        config_value=config["nlp"][key],
+                        enable=enable,
+                        enabled=enabled,
                     )
                 )
-                return value
 
         # Ensure sets of disabled/enabled pipe names are not contradictory.
         disabled_pipes = cls._resolve_component_status(
-            fetch_pipes_status(disable, "disabled"),
-            fetch_pipes_status(enable, "enabled"),
+            list({*disable, *config["nlp"].get("disabled", [])}),
+            enable,
             config["nlp"]["pipeline"],
         )
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
@@ -2084,10 +2075,12 @@ class Language:
         if enable:
             if isinstance(enable, str):
                 enable = [enable]
-            to_disable = [
-                pipe_name for pipe_name in pipe_names if pipe_name not in enable
-            ]
-            if disable and disable != to_disable:
+            to_disable = {
+                *[pipe_name for pipe_name in pipe_names if pipe_name not in enable],
+                *disable,
+            }
+            # If any pipe to be enabled is in to_disable, the specification is inconsistent.
+            if len(set(enable) & to_disable):
                 raise ValueError(Errors.E1042.format(enable=enable, disable=disable))
 
         return tuple(to_disable)
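Taken together, the two hunks change load-time semantics: disable is now merged with the config's nlp.disabled, enable is deliberately not merged with nlp.enabled (it only triggers W123 when the two disagree), and a pipe that ends up both enabled and disabled raises E1042. A sketch, assuming en_core_web_sm is installed (the pipe names are illustrative):

    import spacy

    # "ner" from the argument is merged with whatever nlp.disabled lists:
    nlp = spacy.load("en_core_web_sm", disable=["ner"])
    assert "ner" in nlp.disabled

    # Contradictory specification: enabling and disabling the same pipe
    # spacy.load("en_core_web_sm", enable=["ner"], disable=["ner"])  # ValueError (E1042)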
spacy/ml/models/entity_linker.py

@@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
             cands.append((start_token, end_token))
 
         candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
     # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []
 
 
 @registry.misc("spacy.KBFromFile.v1")
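The rewrite matters for docs that contribute no candidate spans: ops.xp.concatenate insists on matching array dimensions, while model.ops.flatten in thinc tolerates empty per-doc arrays (it drops zero-size entries before concatenating, as far as I can tell). A hedged sketch of the shapes involved:

    from thinc.api import get_current_ops

    ops = get_current_ops()
    per_doc = [ops.asarray2i([(0, 1), (1, 2)]), ops.asarray2i([])]  # second doc: no entities
    lengths = ops.asarray1i([len(c) for c in per_doc])  # [2, 0]
    flat = ops.flatten(per_doc)  # ok; ops.xp.concatenate(per_doc) would raise on the empty entry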
spacy/pipeline/textcat.py

@@ -24,8 +24,8 @@ single_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false
 
 [model.tok2vec.encode]
@@ -72,7 +72,7 @@ subword_features = true
     "textcat",
     assigns=["doc.cats"],
     default_config={
-        "threshold": 0.5,
+        "threshold": 0.0,
         "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
         "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
     },
@@ -144,7 +144,8 @@ class TextCategorizer(TrainablePipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        threshold (float): Cutoff to consider a prediction "positive".
+        threshold (float): Unused, not needed for single-label (exclusive
+            classes) classification.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_cats for the attribute "cats".
@@ -154,7 +155,11 @@ class TextCategorizer(TrainablePipe):
         self.model = model
         self.name = name
         self._rehearsal_model = None
-        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
+        cfg: Dict[str, Any] = {
+            "labels": [],
+            "threshold": threshold,
+            "positive_label": None,
+        }
         self.cfg = dict(cfg)
         self.scorer = scorer
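With the default-config change above, the single-label textcat component ships with threshold 0.0, so the highest-scoring class always wins and the cutoff is effectively unused, as the updated docstring says. A small sketch, assuming a spaCy build that includes this change:

    import spacy

    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    print(textcat.cfg["threshold"])  # 0.0 under the new default config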
spacy/pipeline/textcat_multilabel.py

@@ -24,8 +24,8 @@ multi_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false
 
 [model.tok2vec.encode]
@ -446,7 +446,7 @@ class Scorer:
|
|||
labels (Iterable[str]): The set of possible labels. Defaults to [].
|
||||
multi_label (bool): Whether the attribute allows multiple labels.
|
||||
Defaults to True. When set to False (exclusive labels), missing
|
||||
gold labels are interpreted as 0.0.
|
||||
gold labels are interpreted as 0.0 and the threshold is set to 0.0.
|
||||
positive_label (str): The positive label for a binary task with
|
||||
exclusive classes. Defaults to None.
|
||||
threshold (float): Cutoff to consider a prediction "positive". Defaults
|
||||
|
@ -471,6 +471,8 @@ class Scorer:
|
|||
"""
|
||||
if threshold is None:
|
||||
threshold = 0.5 if multi_label else 0.0
|
||||
if not multi_label:
|
||||
threshold = 0.0
|
||||
f_per_type = {label: PRFScore() for label in labels}
|
||||
auc_per_type = {label: ROCAUCScore() for label in labels}
|
||||
labels = set(labels)
|
||||
|
@ -505,11 +507,10 @@ class Scorer:
|
|||
# Get the highest-scoring for each.
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
|
||||
if pred_label == gold_label and pred_score >= threshold:
|
||||
if pred_label == gold_label:
|
||||
f_per_type[pred_label].tp += 1
|
||||
else:
|
||||
f_per_type[gold_label].fn += 1
|
||||
if pred_score >= threshold:
|
||||
f_per_type[pred_label].fp += 1
|
||||
elif gold_cats:
|
||||
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
|
||||
|
@ -517,7 +518,6 @@ class Scorer:
|
|||
f_per_type[gold_label].fn += 1
|
||||
elif pred_cats:
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
if pred_score >= threshold:
|
||||
f_per_type[pred_label].fp += 1
|
||||
micro_prf = PRFScore()
|
||||
for label_prf in f_per_type.values():
|
||||
|
|
|
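The rule these hunks implement: for exclusive classes (multi_label=False) the argmax prediction is compared directly against the argmax gold label, so the threshold plays no role there; it now only gates multi-label predictions. In miniature:

    pred_cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
    gold_cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
    pred_label = max(pred_cats, key=pred_cats.get)  # "POSITIVE", regardless of any cutoff
    gold_label = max(gold_cats, key=gold_cats.get)  # "POSITIVE"
    assert pred_label == gold_label                 # counted as a true positive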
@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
|
|||
doc_json.pop("tokens")
|
||||
with pytest.raises(ValueError):
|
||||
Doc(doc.vocab).from_json(doc_json, validate=True)
|
||||
|
||||
|
||||
def test_to_json_underscore_doc_getters(doc):
|
||||
def get_text_length(doc):
|
||||
return len(doc.text)
|
||||
|
||||
Doc.set_extension("text_length", getter=get_text_length)
|
||||
doc_json = doc.to_json(underscore=["text_length"])
|
||||
assert doc_json["_"]["text_length"] == get_text_length(doc)
|
||||
|
|
|
@ -9,6 +9,7 @@ from spacy.compat import pickle
|
|||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml import load_kb
|
||||
from spacy.ml.models.entity_linker import build_span_maker
|
||||
from spacy.pipeline import EntityLinker
|
||||
from spacy.pipeline.legacy import EntityLinker_v1
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
@ -715,7 +716,11 @@ TRAIN_DATA = [
|
|||
("Russ Cochran was a member of University of Kentucky's golf team.",
|
||||
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
|
||||
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
|
||||
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
|
||||
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
|
||||
# having a blank instance shouldn't break things
|
||||
("The weather is nice today.",
|
||||
{"links": {}, "entities": [],
|
||||
"sent_starts": [1, -1, 0, 0, 0, 0]})
|
||||
]
|
||||
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
|
||||
# fmt: on
|
||||
|
@ -1196,3 +1201,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
|
|||
|
||||
assert len(doc.ents) == 1
|
||||
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
|
||||
|
||||
|
||||
def test_span_maker_forward_with_empty():
|
||||
"""The forward pass of the span maker may have a doc with no entities."""
|
||||
nlp = English()
|
||||
doc1 = nlp("a b c")
|
||||
ent = doc1[0:1]
|
||||
ent.label_ = "X"
|
||||
doc1.ents = [ent]
|
||||
# no entities
|
||||
doc2 = nlp("x y z")
|
||||
|
||||
# just to get a model
|
||||
span_maker = build_span_maker()
|
||||
span_maker([doc1, doc2], False)
|
||||
|
|
spacy/tests/pipeline/test_pipe_methods.py

@@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config():
 
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
-        # Expected to fail, as config and arguments conflict.
-        with pytest.raises(ValueError):
-            spacy.load(
-                tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
-            )
+        # Expected to succeed, as config and arguments do not conflict.
+        assert spacy.load(
+            tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
+        ).disabled == ["senter", "sentencizer"]
         # Expected to succeed without warning due to the lack of a conflicting config option.
         spacy.load(tmp_dir, enable=["tagger"])
-        # Expected to succeed with a warning, as disable=[] should override the config setting.
-        with pytest.warns(UserWarning):
+        # Expected to fail due to conflict between enable and disabled.
+        with pytest.raises(ValueError):
             spacy.load(
                 tmp_dir,
-                enable=["tagger"],
-                disable=[],
-                config={"nlp": {"disabled": ["senter"]}},
+                enable=["senter"],
+                config={"nlp": {"disabled": ["senter", "tagger"]}},
             )
spacy/tests/pipeline/test_textcat.py

@@ -823,10 +823,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
     assert loss == expected_loss
 
 
-def test_textcat_threshold():
+def test_textcat_multilabel_threshold():
     # Ensure the scorer can be called with a different threshold
     nlp = English()
-    nlp.add_pipe("textcat")
+    nlp.add_pipe("textcat_multilabel")
 
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -849,7 +849,7 @@ def test_textcat_multilabel_threshold():
     )
     pos_f = scores["cats_score"]
     assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
-    assert pos_f > macro_f
+    assert pos_f >= macro_f
 
 
 def test_textcat_multi_threshold():
spacy/tests/serialize/test_serialize_pipeline.py

@@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable():
     assert nlp3.component_names == ["ner", "tagger"]
     with make_tempdir() as d:
         nlp3.to_disk(d)
-        with pytest.warns(UserWarning):
-            nlp4 = spacy.load(d, disable=["ner"])
-        assert nlp4.pipe_names == ["tagger"]
+        nlp4 = spacy.load(d, disable=["ner"])
+        assert nlp4.pipe_names == []
         assert nlp4.component_names == ["ner", "tagger"]
-        assert nlp4.disabled == ["ner"]
+        assert nlp4.disabled == ["ner", "tagger"]
     with make_tempdir() as d:
         nlp.to_disk(d)
         nlp5 = spacy.load(d, exclude=["tagger"])
spacy/tests/test_cli.py

@@ -1,5 +1,6 @@
 import os
 import math
+import pkg_resources
 from random import sample
 from typing import Counter
 
@@ -25,6 +26,7 @@ from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.package import get_third_party_dependencies
 from spacy.cli.package import _is_permitted_package_name
+from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
@@ -855,3 +857,42 @@ def test_span_length_freq_dist_output_must_be_correct():
     span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
     assert sum(span_freqs.values()) >= threshold
     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
+
+
+@pytest.mark.parametrize(
+    "reqs,output",
+    [
+        [
+            """
+            spacy
+
+            # comment
+
+            thinc""",
+            (False, False),
+        ],
+        [
+            """# comment
+            --some-flag
+            spacy""",
+            (False, False),
+        ],
+        [
+            """# comment
+            --some-flag
+            spacy; python_version >= '3.6'""",
+            (False, False),
+        ],
+        [
+            """# comment
+            spacyunknowndoesnotexist12345""",
+            (True, False),
+        ],
+    ],
+)
+def test_project_check_requirements(reqs, output):
+    # excessive guard against unlikely package name
+    try:
+        pkg_resources.require("spacyunknowndoesnotexist12345")
+    except pkg_resources.DistributionNotFound:
+        assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
spacy/tests/pipeline/test_models.py

@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
 
 
 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
 
 
 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
     }
 
 
-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())
spacy/tests/test_scorer.py

@@ -474,3 +474,50 @@ def test_prf_score():
     assert (a.precision, a.recall, a.fscore) == approx(
         (c.precision, c.recall, c.fscore)
     )
+
+
+def test_score_cats(en_tokenizer):
+    text = "some text"
+    gold_doc = en_tokenizer(text)
+    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
+    pred_doc = en_tokenizer(text)
+    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
+    example = Example(pred_doc, gold_doc)
+    # threshold is ignored for multi_label=False
+    scores1 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.1,
+    )
+    scores2 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.9,
+    )
+    assert scores1["cats_score"] == 1.0
+    assert scores2["cats_score"] == 1.0
+    assert scores1 == scores2
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.9,
+    )
+    assert scores["cats_macro_f"] == 0.0
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.1,
+    )
+    assert scores["cats_macro_f"] == 0.5
spacy/tokens/doc.pyx

@@ -1667,6 +1667,20 @@ cdef class Doc:
 
         if underscore:
             user_keys = set()
+            # Handle doc attributes with .get to include values from getters
+            # and not only values stored in user_data, for backwards
+            # compatibility
+            for attr in underscore:
+                if self.has_extension(attr):
+                    if "_" not in data:
+                        data["_"] = {}
+                    value = self._.get(attr)
+                    if not srsly.is_json_serializable(value):
+                        raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
+                    data["_"][attr] = value
+                    user_keys.add(attr)
+            # Token and span attributes only include values stored in user_data
+            # and not values generated by getters
             if self.user_data:
                 for data_key, value in self.user_data.copy().items():
                     if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
@@ -1677,20 +1691,15 @@ cdef class Doc:
                         user_keys.add(attr)
                         if not srsly.is_json_serializable(value):
                             raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
-                        # Check if doc attribute
-                        if start is None:
-                            if "_" not in data:
-                                data["_"] = {}
-                            data["_"][attr] = value
-                        # Check if token attribute
-                        elif end is None:
+                        # Token attribute
+                        if start is not None and end is None:
                             if "underscore_token" not in data:
                                 data["underscore_token"] = {}
                             if attr not in data["underscore_token"]:
                                 data["underscore_token"][attr] = []
                             data["underscore_token"][attr].append({"start": start, "value": value})
-                        # Else span attribute
-                        else:
+                        # Span attribute
+                        elif start is not None and end is not None:
                             if "underscore_span" not in data:
                                 data["underscore_span"] = {}
                             if attr not in data["underscore_span"]:
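The first hunk means Doc-level extensions backed by a getter (with nothing stored in user_data) are now serialized when requested via underscore; token- and span-level values still come from user_data only. A minimal sketch, assuming a spaCy build with this change:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    Doc.set_extension("text_length", getter=lambda doc: len(doc.text), force=True)
    doc = nlp("hello world")
    print(doc.to_json(underscore=["text_length"])["_"]["text_length"])  # 11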
spacy/tokens/span.pyi

@@ -117,15 +117,13 @@ class Span:
     end_char: int
     label: int
     kb_id: int
+    id: int
     ent_id: int
     ent_id_: str
     @property
-    def id(self) -> int: ...
-    @property
-    def id_(self) -> str: ...
-    @property
     def orth_(self) -> str: ...
     @property
     def lemma_(self) -> str: ...
     label_: str
     kb_id_: str
+    id_: str
@ -443,9 +443,9 @@ def load_model_from_package(
|
|||
name: str,
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||||
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||||
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from an installed package.
|
||||
|
@ -619,9 +619,9 @@ def load_model_from_init_py(
|
|||
init_file: Union[Path, str],
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||||
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||||
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Helper function to use in the `load()` method of a model package's
|
||||
|
|
website/docs/api/language.md

@@ -64,12 +64,12 @@ spaCy loads a model under the hood based on its
 > ```
 
 | Name | Description |
-| ------------------------------------- | ------------------------------------------------------------------------------------------------- |
+| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
 | `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
 | _keyword-only_ | |
 | `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
-| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
-| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
+| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
+| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
 | `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
 | `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
website/docs/api/scorer.md

@@ -230,14 +230,15 @@ The reported `{attr}_score` depends on the classification properties:
 > ```
 
 | Name | Description |
-| ---------------- | -------------------------------------------------------------------------------------------------- |
+| ---------------- | ------------------------------------------------------------------------------------------------------------------ |
 | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr` | The attribute to score. ~~str~~ |
 | _keyword-only_ | |
 | `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
 | `labels` | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
-| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
+| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
 | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
 | `threshold` | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~ |
 | **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
 
 ## Scorer.score_links {#score_links tag="staticmethod" new="3"}
website/docs/api/textcategorizer.md

@@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 > config = {
->     "threshold": 0.5,
 >     "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 > }
 > nlp.add_pipe("textcat", config=config)
@@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.
 
 | Setting | Description |
 | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
 | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
@@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ | |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
 | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
 
 ## TextCategorizer.\_\_call\_\_ {#call tag="method"}
website/docs/api/top-level.md

@@ -46,11 +46,11 @@ specified separately using the new `exclude` keyword argument.
 > ```
 
 | Name | Description |
-| ------------------------------------- | ------------------------------------------------------------------------------------------------- |
+| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
 | `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
 | _keyword-only_ | |
 | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
-| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
+| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
 | `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
@ -363,7 +363,8 @@ nlp.enable_pipe("tagger")
|
|||
```
|
||||
|
||||
In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is
|
||||
set, all components except for those in `enable` are disabled.
|
||||
set, all components except for those in `enable` are disabled. If `enable` and
|
||||
`disable` conflict (i.e. the same component is included in both), an error is raised.
|
||||
|
||||
```python
|
||||
# Load the complete pipeline, but disable all components except for tok2vec and tagger
|
||||
|
|
website/docs/usage/rule-based-matching.md

@@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >     for ent in doc.ents:
 >         retokenizer.merge(ent)
 > ```
website/meta/languages.json

@@ -4,12 +4,22 @@
       "code": "af",
       "name": "Afrikaans"
     },
+    {
+      "code": "am",
+      "name": "Amharic",
+      "has_examples": true
+    },
     {
       "code": "ar",
       "name": "Arabic",
       "example": "هذه جملة",
       "has_examples": true
     },
+    {
+      "code": "az",
+      "name": "Azerbaijani",
+      "has_examples": true
+    },
     {
       "code": "bg",
       "name": "Bulgarian",
@@ -142,6 +152,11 @@
       "code": "ga",
       "name": "Irish"
     },
+    {
+      "code": "grc",
+      "name": "Ancient Greek",
+      "has_examples": true
+    },
     {
       "code": "gu",
       "name": "Gujarati",
@@ -260,6 +275,10 @@
       "example": "Адамга эң кыйыны — күн сайын адам болуу",
       "has_examples": true
     },
+    {
+      "code": "la",
+      "name": "Latin"
+    },
     {
       "code": "lb",
       "name": "Luxembourgish",
@@ -448,6 +467,11 @@
       "example": "นี่คือประโยค",
       "has_examples": true
     },
+    {
+      "code": "ti",
+      "name": "Tigrinya",
+      "has_examples": true
+    },
     {
       "code": "tl",
       "name": "Tagalog"
(website stylesheet)

@@ -149,6 +149,9 @@
   & > span
     display: block
 
+  a
+    text-decoration: underline
+
 .small
   font-size: var(--font-size-code)
   line-height: 1.65
website/src/widgets/quickstart-install.js

@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
                 setters={setters}
                 showDropdown={showDropdown}
             >
+                <QS os="mac" hardware="gpu" platform="arm">
+                    # Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
+                </QS>
                 <QS package="pip" config="venv">
                     python -m venv .env
                 </QS>
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
                     {nightly ? ' --pre' : ''}
                 </QS>
                 <QS package="conda">conda install -c conda-forge spacy</QS>
-                <QS package="conda" hardware="gpu">
+                <QS package="conda" hardware="gpu" os="windows">
+                    conda install -c conda-forge cupy
+                </QS>
+                <QS package="conda" hardware="gpu" os="linux">
+                    conda install -c conda-forge cupy
+                </QS>
+                <QS package="conda" hardware="gpu" os="mac" platform="x86">
                     conda install -c conda-forge cupy
                 </QS>
                 <QS package="conda" config="train">