From 84d9cb6b387572293c8bcf26b0e71b508104b165 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 21 Oct 2022 11:54:17 +0200
Subject: [PATCH 1/9] Auto-format code with black (#11687)

Co-authored-by: explosion-bot <explosion-bot@users.noreply.github.com>
---
 spacy/tests/pipeline/test_tok2vec.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 659274db9..e423d9a19 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -231,7 +231,7 @@ def test_tok2vec_listener_callback():
 
 
 def test_tok2vec_listener_overfitting():
-    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -264,7 +264,7 @@ def test_tok2vec_listener_overfitting():
 
 
 def test_tok2vec_frozen_not_annotating():
-    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -274,12 +274,16 @@ def test_tok2vec_frozen_not_annotating():
 
     for i in range(2):
         losses = {}
-        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
-            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+        with pytest.raises(
+            ValueError, match=r"the tok2vec embedding layer is not updated"
+        ):
+            nlp.update(
+                train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+            )
 
 
 def test_tok2vec_frozen_overfitting():
-    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -289,7 +293,13 @@ def test_tok2vec_frozen_overfitting():
 
     for i in range(100):
         losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+        nlp.update(
+            train_examples,
+            sgd=optimizer,
+            losses=losses,
+            exclude=["tok2vec"],
+            annotates=["tok2vec"],
+        )
     assert losses["tagger"] < 0.0001
 
     # test the trained model

From 88d35450dcedd89fa739640d8a8d3e62f3643b4a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 25 Oct 2022 14:53:18 +0200
Subject: [PATCH 2/9] Rename test helper method with non-test_ name (#11701)

---
 spacy/tests/test_models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 2306cabb7..d91ed1201 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
 
 
 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
 
 
 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
     }
 
 
-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())
 
 

From 8740e4341f03fe2720f50c64e2f94a339d6bd4be Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 25 Oct 2022 14:54:54 +0200
Subject: [PATCH 3/9] Update languages and version in README and website
 (#11694)

---
 README.md                   |  6 +++---
 website/meta/languages.json | 28 ++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index d9ef83e01..abfc3da67 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ be used in real products.
 
 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the MIT license.
 
-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.
 
 ## Features
 
-- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 0028b4a5f..bd1535c90 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -4,12 +4,22 @@
             "code": "af",
             "name": "Afrikaans"
         },
+        {
+            "code": "am",
+            "name": "Amharic",
+            "has_examples": true
+        },
         {
             "code": "ar",
             "name": "Arabic",
             "example": "هذه جملة",
             "has_examples": true
         },
+        {
+            "code": "az",
+            "name": "Azerbaijani",
+            "has_examples": true
+        },
         {
             "code": "bg",
             "name": "Bulgarian",
@@ -65,7 +75,7 @@
         {
             "code": "dsb",
             "name": "Lower Sorbian",
-	    "has_examples": true
+            "has_examples": true
         },
         {
             "code": "el",
@@ -142,6 +152,11 @@
             "code": "ga",
             "name": "Irish"
         },
+        {
+            "code": "grc",
+            "name": "Ancient Greek",
+            "has_examples": true
+        },
         {
             "code": "gu",
             "name": "Gujarati",
@@ -172,7 +187,7 @@
         {
             "code": "hsb",
             "name": "Upper Sorbian",
-	    "has_examples": true
+            "has_examples": true
         },
         {
             "code": "hu",
@@ -260,6 +275,10 @@
             "example": "Адамга эң кыйыны — күн сайын адам болуу",
             "has_examples": true
         },
+        {
+            "code": "la",
+            "name": "Latin"
+        },
         {
             "code": "lb",
             "name": "Luxembourgish",
@@ -448,6 +467,11 @@
             "example": "นี่คือประโยค",
             "has_examples": true
         },
+        {
+            "code": "ti",
+            "name": "Tigrinya",
+            "has_examples": true
+        },
         {
             "code": "tl",
             "name": "Tagalog"

From 0a9859ba01c8a51842218e1817dff7ff784951df Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 25 Oct 2022 19:38:23 +0200
Subject: [PATCH 4/9] Reduce python 3.10 in CI to one OS (#11703)

---
 azure-pipelines.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 357cce835..eea07cb7a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -76,15 +76,15 @@ jobs:
         #        Python39Mac:
         #          imageName: "macos-latest"
         #          python.version: "3.9"
-        Python310Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.10"
+        #        Python310Linux:
+        #          imageName: "ubuntu-latest"
+        #          python.version: "3.10"
         Python310Windows:
           imageName: "windows-latest"
           python.version: "3.10"
-        Python310Mac:
-          imageName: "macos-latest"
-          python.version: "3.10"
+        #        Python310Mac:
+        #          imageName: "macos-latest"
+        #          python.version: "3.10"
         Python311Linux:
           imageName: 'ubuntu-latest'
           python.version: '3.11.0-rc.2'

From a9139907a943f0cc91dac0338aa43caa38939778 Mon Sep 17 00:00:00 2001
From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com>
Date: Wed, 26 Oct 2022 09:15:13 +0300
Subject: [PATCH 5/9] Update GitHub Actions to deal with deprecations (#11702)

---
 .github/workflows/autoblack.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 8d0282650..3ad4cf408 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -12,10 +12,10 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
             ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v3
       - run: pip install black
       - name: Auto-format code if needed
         run: black spacy
@@ -23,10 +23,11 @@ jobs:
       # code and makes GitHub think the action failed
       - name: Check for modified files
         id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
+
       - name: Create Pull Request
         if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
         with:
             title: Auto-format code with black
             labels: meta

From 865691d169c3be413007f0d7324e03a7aac3b3cb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 26 Oct 2022 08:43:00 +0200
Subject: [PATCH 6/9] Adjust default attrs for textcat configs (#11698)

---
 spacy/pipeline/textcat.py            | 4 ++--
 spacy/pipeline/textcat_multilabel.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index c45f819fc..59549ad99 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -24,8 +24,8 @@ single_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false
 
 [model.tok2vec.encode]
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 493c440c3..eb83d9cb7 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -24,8 +24,8 @@ multi_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false
 
 [model.tok2vec.encode]

From 6b78135b9e158e5bc02e39c1a73ef28bb360a44f Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 27 Oct 2022 22:08:24 +0900
Subject: [PATCH 7/9] Add warning to install widget for M1 GPUs (#11666)

* Add warning to install widget for M1 GPUs

* Use Thinc tracking issue instead

* Update website/src/widgets/quickstart-install.js

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Underline URL in warning

* Update website/src/widgets/quickstart-install.js

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Don't install cupy on m1 gpus

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 website/src/styles/quickstart.module.sass |  3 +++
 website/src/widgets/quickstart-install.js | 11 ++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index 8ad106a78..d0f9db551 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -149,6 +149,9 @@
     & > span
         display: block
 
+    a
+        text-decoration: underline
+
 .small
     font-size: var(--font-size-code)
     line-height: 1.65
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 0d2186acb..28dd14ecc 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
                         setters={setters}
                         showDropdown={showDropdown}
                     >
+                        <QS os="mac" hardware="gpu" platform="arm">
+                            # Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
+                        </QS>
                         <QS package="pip" config="venv">
                             python -m venv .env
                         </QS>
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
                             {nightly ? ' --pre' : ''}
                         </QS>
                         <QS package="conda">conda install -c conda-forge spacy</QS>
-                        <QS package="conda" hardware="gpu">
+                        <QS package="conda" hardware="gpu" os="windows">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="conda" hardware="gpu" os="linux">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="conda" hardware="gpu" os="mac" platform="x86">
                             conda install -c conda-forge cupy
                         </QS>
                         <QS package="conda" config="train">

From d61e742960ef230b423dfa157449b291a03bd119 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 28 Oct 2022 17:25:34 +0900
Subject: [PATCH 8/9] Handle Docs with no entities in EntityLinker (#11640)

* Handle docs with no entities

If a whole batch contains no entities it won't make it to the model, but
it's possible for individual Docs to have no entities. Before this
commit, those Docs would cause an error when attempting to concatenate
arrays because the dimensions didn't match.

It turns out the process of preparing the Ragged at the end of the span
maker forward was a little different from list2ragged, which just uses
the flatten function directly. Letting list2ragged do the conversion
avoids the dimension issue.

This did not come up before because in NEL demo projects it's typical
for data with no entities to be discarded before it reaches the NEL
component.

This includes a simple direct test that shows the issue and checks it's
resolved. It doesn't check if there are any downstream changes, so a
more complete test could be added. A full run was tested by adding an
example with no entities to the Emerson sample project.

* Add a blank instance to default training data in tests

Rather than adding a specific test, since not failing on instances with
no entities is basic functionality, it makes sense to add it to the
default set.

* Fix without modifying architecture

If the architecture is modified this would have to be a new version, but
this change isn't big enough to merit that.
---
 spacy/ml/models/entity_linker.py           |  7 +++----
 spacy/tests/pipeline/test_entity_linker.py | 22 +++++++++++++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 4d18d216a..299b6bb52 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
             cands.append((start_token, end_token))
 
         candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
     # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []
 
 
 @registry.misc("spacy.KBFromFile.v1")
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 4d683acc5..99f164f15 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -9,6 +9,7 @@ from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
 from spacy.pipeline import EntityLinker
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -715,7 +716,11 @@ TRAIN_DATA = [
     ("Russ Cochran was a member of University of Kentucky's golf team.",
         {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
          "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
-         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+    # having a blank instance shouldn't break things
+    ("The weather is nice today.",
+        {"links": {}, "entities": [],
+         "sent_starts": [1, -1, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@@ -1196,3 +1201,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
 
     assert len(doc.ents) == 1
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
+
+
+def test_span_maker_forward_with_empty():
+    """The forward pass of the span maker may have a doc with no entities."""
+    nlp = English()
+    doc1 = nlp("a b c")
+    ent = doc1[0:1]
+    ent.label_ = "X"
+    doc1.ents = [ent]
+    # no entities
+    doc2 = nlp("x y z")
+
+    # just to get a model
+    span_maker = build_span_maker()
+    span_maker([doc1, doc2], False)

From d25f09468c4eca20eb464d78d35e439474ed2dbc Mon Sep 17 00:00:00 2001
From: Aaron Zipp <15341396+aaronzipp@users.noreply.github.com>
Date: Mon, 31 Oct 2022 05:27:12 +0100
Subject: [PATCH 9/9] Spelling mistake in rule-based-matching.md (#11717)

Changed retokenize to retokenizer so the `with` target matches its use
---
 website/docs/usage/rule-based-matching.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index f096890cb..64bbf8e7b 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >   for ent in doc.ents:
 >       retokenizer.merge(ent)
 > ```