Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master-4

2025-07-15 18:52:29 +03:00 · 2022-11-03 09:42:36 +01:00 · 2022-11-03 09:42:36 +01:00 · 68b8fa2df2
commit 68b8fa2df2
parent cae4589f5a 1211552f0e
22 changed files with 217 additions and 124 deletions
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -1,9 +1,7 @@
 parameters:
  python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'
+  num_build_jobs: 2

 steps:
  - task: UsePythonVersion@0
@ -17,16 +15,16 @@ steps:
    displayName: 'Set variables'

  - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
    displayName: "Install dependencies"

  - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
    displayName: 'Run mypy'
    condition: ne(variables['python_version'], '3.6')

@ -35,35 +33,24 @@ steps:
      contents: "spacy"
    displayName: "Delete source directory"

+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
  - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
    displayName: "Uninstall all packages"

  - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
    displayName: "Install from sdist"

  - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+      python -W error -c "import spacy"
+    displayName: "Test import"

  - script: |
      python -m spacy download ca_core_news_sm
@ -106,13 +93,22 @@ steps:
    displayName: 'Test assemble CLI vectors warning'
    condition: eq(variables['python_version'], '3.8')

+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install --pre thinc-apple-ops
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
+
  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
    displayName: 'Test website/meta/universe.json'
    condition: eq(variables['python_version'], '3.8')

-  - script: |
-      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@ -12,10 +12,10 @@ jobs:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
            ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
@ -23,10 +23,11 @@ jobs:
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
+
      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        with:
            title: Auto-format code with black
            labels: meta
--- a/.github/workflows/explosionbot.yml
+++ b/.github/workflows/explosionbot.yml
@ -8,14 +8,14 @@ on:

 jobs:
  explosion-bot:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
    steps:
      - name: Dump GitHub context
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
      - name: Install and run explosion-bot
        run: |
          pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
--- a/.github/workflows/slowtests.yml
+++ b/.github/workflows/slowtests.yml
@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
@ -23,9 +23,9 @@ jobs:
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after="$yesterday" --before="$today" | grep commit ; then
-            echo "::set-output name=run_tests::true"
+            echo run_tests=true >> $GITHUB_OUTPUT
          else
-            echo "::set-output name=run_tests::false"
+            echo run_tests=false >> $GITHUB_OUTPUT
          fi

      - name: Trigger buildkite build
--- a/.github/workflows/spacy_universe_alert.yml
+++ b/.github/workflows/spacy_universe_alert.yml
@ -17,8 +17,8 @@ jobs:
        run: |
          echo "$GITHUB_CONTEXT"

-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
      - name: Install Bernadette app dependency and send an alert
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/README.md
+++ b/README.md
@ -8,7 +8,7 @@ be used in real products.

 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the MIT license.

-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -79,7 +79,7 @@ more people can benefit from it.

 ## Features

- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -76,24 +76,24 @@ jobs:
        #        Python39Mac:
        #          imageName: "macos-latest"
        #          python.version: "3.9"
-        Python310Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.10"
+        #        Python310Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.10"
        Python310Windows:
          imageName: "windows-latest"
          python.version: "3.10"
-        Python310Mac:
-          imageName: "macos-latest"
-          python.version: "3.10"
+        #        Python310Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.10"
        Python311Linux:
          imageName: 'ubuntu-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
        Python311Windows:
          imageName: 'windows-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
        Python311Mac:
          imageName: 'macos-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11.0'
      maxParallel: 4
    pool:
      vmImage: $(imageName)
@ -101,20 +101,3 @@ jobs:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'
-          architecture: 'x64'
-
-#  - job: "TestGPU"
-#    dependsOn: "Validate"
-#    strategy:
-#      matrix:
-#        Python38LinuxX64_GPU:
-#          python.version: '3.8'
-#    pool:
-#      name: "LinuxX64_GPU"
-#    steps:
-#      - template: .github/azure-steps.yml
-#        parameters:
-#          python_version: '$(python.version)'
-#          architecture: 'x64'
-#          gpu: true
-#          num_build_jobs: 24
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
            cands.append((start_token, end_token))

        candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
    # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []


@registry.misc("spacy.KBFromFile.v1")
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -27,8 +27,8 @@ single_label_default_config = """
 [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]
@ -75,7 +75,7 @@ subword_features = true
    "textcat",
    assigns=["doc.cats"],
    default_config={
-        "threshold": 0.5,
+        "threshold": 0.0,
        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
        "save_activations": False,
@ -158,7 +158,8 @@ class TextCategorizer(TrainablePipe):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
-        threshold (float): Cutoff to consider a prediction "positive".
+        threshold (float): Unused, not needed for single-label (exclusive
+            classes) classification.
        scorer (Optional[Callable]): The scoring method. Defaults to
                Scorer.score_cats for the attribute "cats".

@ -168,7 +169,7 @@ class TextCategorizer(TrainablePipe):
        self.model = model
        self.name = name
        self._rehearsal_model = None
-        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
+        cfg: Dict[str, Any] = {"labels": [], "threshold": threshold, "positive_label": None}
        self.cfg = dict(cfg)
        self.scorer = scorer
        self.save_activations = save_activations
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@ -24,8 +24,8 @@ multi_label_default_config = """
 [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -446,7 +446,7 @@ class Scorer:
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True. When set to False (exclusive labels), missing
-            gold labels are interpreted as 0.0.
+            gold labels are interpreted as 0.0 and the threshold is set to 0.0.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
@ -471,6 +471,8 @@ class Scorer:
        """
        if threshold is None:
            threshold = 0.5 if multi_label else 0.0
+        if not multi_label:
+            threshold = 0.0
        f_per_type = {label: PRFScore() for label in labels}
        auc_per_type = {label: ROCAUCScore() for label in labels}
        labels = set(labels)
@ -505,20 +507,18 @@ class Scorer:
                # Get the highest-scoring for each.
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
-                if pred_label == gold_label and pred_score >= threshold:
+                if pred_label == gold_label:
                    f_per_type[pred_label].tp += 1
                else:
                    f_per_type[gold_label].fn += 1
-                    if pred_score >= threshold:
-                        f_per_type[pred_label].fp += 1
+                    f_per_type[pred_label].fp += 1
            elif gold_cats:
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
                if gold_score > 0:
                    f_per_type[gold_label].fn += 1
            elif pred_cats:
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
-                if pred_score >= threshold:
-                    f_per_type[pred_label].fp += 1
+                f_per_type[pred_label].fp += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -10,6 +10,7 @@ from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
 from spacy.pipeline import EntityLinker, TrainablePipe
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@ -716,7 +717,11 @@ TRAIN_DATA = [
    ("Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
-         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+    # having a blank instance shouldn't break things
+    ("The weather is nice today.",
+        {"links": {}, "entities": [],
+         "sent_starts": [1, -1, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@ -1260,3 +1265,18 @@ def test_save_activations():
    assert scores.data.shape == (2, 1)
    assert scores.data.dtype == "float32"
    assert scores.lengths.shape == (1,)
+
+
+def test_span_maker_forward_with_empty():
+    """The forward pass of the span maker may have a doc with no entities."""
+    nlp = English()
+    doc1 = nlp("a b c")
+    ent = doc1[0:1]
+    ent.label_ = "X"
+    doc1.ents = [ent]
+    # no entities
+    doc2 = nlp("x y z")
+
+    # just to get a model
+    span_maker = build_span_maker()
+    span_maker([doc1, doc2], False)
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -824,10 +824,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
    assert loss == expected_loss


-def test_textcat_threshold():
+def test_textcat_multilabel_threshold():
    # Ensure the scorer can be called with a different threshold
    nlp = English()
-    nlp.add_pipe("textcat")
+    nlp.add_pipe("textcat_multilabel")

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@ -850,7 +850,7 @@ def test_textcat_threshold():
    )
    pos_f = scores["cats_score"]
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
-    assert pos_f > macro_f
+    assert pos_f >= macro_f


 def test_textcat_multi_threshold():
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@ -231,7 +231,7 @@ def test_tok2vec_listener_callback():


 def test_tok2vec_listener_overfitting():
-    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
@ -264,7 +264,7 @@ def test_tok2vec_listener_overfitting():


 def test_tok2vec_frozen_not_annotating():
-    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
@ -274,12 +274,16 @@ def test_tok2vec_frozen_not_annotating():

    for i in range(2):
        losses = {}
-        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
-            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+        with pytest.raises(
+            ValueError, match=r"the tok2vec embedding layer is not updated"
+        ):
+            nlp.update(
+                train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+            )


 def test_tok2vec_frozen_overfitting():
-    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
@ -289,7 +293,13 @@ def test_tok2vec_frozen_overfitting():

    for i in range(100):
        losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+        nlp.update(
+            train_examples,
+            sgd=optimizer,
+            losses=losses,
+            exclude=["tok2vec"],
+            annotates=["tok2vec"],
+        )
    assert losses["tagger"] < 0.0001

    # test the trained model
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():


 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


 def get_all_params(model):
@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
    }


-def test_tok2vec():
+def make_test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())


--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@ -474,3 +474,50 @@ def test_prf_score():
    assert (a.precision, a.recall, a.fscore) == approx(
        (c.precision, c.recall, c.fscore)
    )
+
+
+def test_score_cats(en_tokenizer):
+    text = "some text"
+    gold_doc = en_tokenizer(text)
+    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
+    pred_doc = en_tokenizer(text)
+    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
+    example = Example(pred_doc, gold_doc)
+    # threshold is ignored for multi_label=False
+    scores1 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.1,
+    )
+    scores2 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.9,
+    )
+    assert scores1["cats_score"] == 1.0
+    assert scores2["cats_score"] == 1.0
+    assert scores1 == scores2
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.9,
+    )
+    assert scores["cats_macro_f"] == 0.0
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.1,
+    )
+    assert scores["cats_macro_f"] == 0.5
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties:
 > print(scores["cats_macro_auc"])
 > ```

-| Name             | Description                                                                                                                                        |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                |
-| `attr`           | The attribute to score. ~~str~~                                                                                                                    |
-| _keyword-only_   |                                                                                                                                                    |
-| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
-| labels           | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~                                                                                    |
-| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                                                                         |
-| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                 |
-| **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                             |
+| Name             | Description                                                                                                                                                                                        |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                                |
+| `attr`           | The attribute to score. ~~str~~                                                                                                                                                                    |
+| _keyword-only_   |                                                                                                                                                                                                    |
+| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~                                                 |
+| labels           | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                    |
+| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
+| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                                                                 |
+| `threshold`      | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~                                                    |
+| **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                                                                             |

 ## Scorer.score_links {#score_links tag="staticmethod" new="3"}

--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 > config = {
->    "threshold": 0.5,
 >    "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 > }
 > nlp.add_pipe("textcat", config=config)
@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.

 | Setting     | Description                                                                                                                                                      |
 | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~                                        |
 | `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `scorer`    | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~                                 |

@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and
 | `model`                                         | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~       |
 | `name`                                          | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
 | _keyword-only_                                  |                                                                                                                                  |
-| `threshold`                                     | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                   |
+| `threshold`                                     | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~        |
 | `scorer`                                        | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. The supported activations is `"probabilities"`. ~~Union[bool, list[str]]~~            |

--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -1791,7 +1791,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >   for ent in doc.ents:
 >       retokenizer.merge(ent)
 > ```
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -4,12 +4,22 @@
            "code": "af",
            "name": "Afrikaans"
        },
+        {
+            "code": "am",
+            "name": "Amharic",
+            "has_examples": true
+        },
        {
            "code": "ar",
            "name": "Arabic",
            "example": "هذه جملة",
            "has_examples": true
        },
+        {
+            "code": "az",
+            "name": "Azerbaijani",
+            "has_examples": true
+        },
        {
            "code": "bg",
            "name": "Bulgarian",
@ -65,7 +75,7 @@
        {
            "code": "dsb",
            "name": "Lower Sorbian",
-	    "has_examples": true
+            "has_examples": true
        },
        {
            "code": "el",
@ -142,6 +152,11 @@
            "code": "ga",
            "name": "Irish"
        },
+        {
+            "code": "grc",
+            "name": "Ancient Greek",
+            "has_examples": true
+        },
        {
            "code": "gu",
            "name": "Gujarati",
@ -172,7 +187,7 @@
        {
            "code": "hsb",
            "name": "Upper Sorbian",
-	    "has_examples": true
+            "has_examples": true
        },
        {
            "code": "hu",
@ -260,6 +275,10 @@
            "example": "Адамга эң кыйыны — күн сайын адам болуу",
            "has_examples": true
        },
+        {
+            "code": "la",
+            "name": "Latin"
+        },
        {
            "code": "lb",
            "name": "Luxembourgish",
@ -448,6 +467,11 @@
            "example": "นี่คือประโยค",
            "has_examples": true
        },
+        {
+            "code": "ti",
+            "name": "Tigrinya",
+            "has_examples": true
+        },
        {
            "code": "tl",
            "name": "Tagalog"
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@ -149,6 +149,9 @@
    & > span
        display: block

+    a
+        text-decoration: underline
+
 .small
    font-size: var(--font-size-code)
    line-height: 1.65
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
                        setters={setters}
                        showDropdown={showDropdown}
                    >
+                        <QS os="mac" hardware="gpu" platform="arm">
+                            # Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
+                        </QS>
                        <QS package="pip" config="venv">
                            python -m venv .env
                        </QS>
@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
                            {nightly ? ' --pre' : ''}
                        </QS>
                        <QS package="conda">conda install -c conda-forge spacy</QS>
-                        <QS package="conda" hardware="gpu">
+                        <QS package="conda" hardware="gpu" os="windows">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="conda" hardware="gpu" os="linux">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="conda" hardware="gpu" os="mac" platform="x86">
                            conda install -c conda-forge cupy
                        </QS>
                        <QS package="conda" config="train">