From f00254ae276eca963991efb8a45748b2948b1c77 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 20 Jun 2022 08:48:40 +0100 Subject: [PATCH 01/25] add counts to verbose list of NER labels (#10957) --- spacy/cli/debug_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 8a6dde955..bd05471b1 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -361,7 +361,7 @@ def debug_data( if label != "-" ] labels_with_counts = _format_labels(labels_with_counts, counts=True) - msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose) + msg.text(f"Labels in train data: {labels_with_counts}", show=verbose) missing_labels = model_labels - labels if missing_labels: msg.warn( From cdad815c6854a5349abbde469f2478585b118e6a Mon Sep 17 00:00:00 2001 From: Lucaterre Date: Mon, 20 Jun 2022 14:28:49 +0200 Subject: [PATCH 02/25] updated spacy universe for spacyfishing --- .github/contributors/Lucaterre.md | 106 ++++++++++++++++++++++++++++++ website/meta/universe.json | 29 ++++++++ 2 files changed, 135 insertions(+) create mode 100644 .github/contributors/Lucaterre.md diff --git a/.github/contributors/Lucaterre.md b/.github/contributors/Lucaterre.md new file mode 100644 index 000000000..5da763b22 --- /dev/null +++ b/.github/contributors/Lucaterre.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- |---------------| +| Name | Lucas Terriel | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2022-06-20 | +| GitHub username | Lucaterre | +| Website (optional) | | \ No newline at end of file diff --git a/website/meta/universe.json b/website/meta/universe.json index 9b644adf4..ce2c63739 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,34 @@ { "resources": [ + { + "id": "spacyfishing", + "title": "spaCy fishing", + "slogan": "Named entity disambiguation and linking on Wikidata in spaCy with Entity-Fishing.", + "description": "A spaCy wrapper of Entity-Fishing for named entity disambiguation and linking against a Wikidata knowledge base.", + "github": "Lucaterre/spacyfishing", + "pip": "spacyfishing", + "code_example": [ + "import spacy", + "text = 'Victor Hugo and Honoré de Balzac are French writers who lived in Paris.'", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe('spacyfishing')", + "doc = nlp(text)", + "for span in doc.ents:", + " print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))", + "# ('Victor Hugo', 'PERSON', 'Q535', 'https://www.wikidata.org/wiki/Q535', 0.972)", + "# ('Honoré de Balzac', 'PERSON', 'Q9711', 'https://www.wikidata.org/wiki/Q9711', 0.9724)", + "# ('French', 'NORP', 'Q121842', 'https://www.wikidata.org/wiki/Q121842', 0.3739)", + "# ('Paris', 'GPE', 'Q90', 'https://www.wikidata.org/wiki/Q90', 0.5652)", + "## Set parameter `extra_info` to `True` and check also span._.description, span._.src_description, span._.normal_term, span._.other_ids" + ], + "category": ["models", "pipeline"], + "tags": ["NER", "NEL"], + "author": "Lucas Terriel", + "author_links": { + "twitter": "TerreLuca", + "github": "Lucaterre" + } + }, { "id": "aim-spacy", "title": "Aim-spaCy", From 2820d7dd8daa66e12bb7c07b1dcfb31423741a72 Mon Sep 17 00:00:00 2001 From: Lucaterre Date: Mon, 20 Jun 2022 15:26:23 +0200 Subject: [PATCH 03/25] correct typo in universe.json for 'code_example' key : pipe name 'entityfishing' --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index ce2c63739..4a3ec6225 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -11,7 +11,7 @@ "import spacy", "text = 'Victor Hugo and Honoré de Balzac are French writers who lived in Paris.'", "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe('spacyfishing')", + "nlp.add_pipe('entityfishing')", "doc = nlp(text)", "for span in doc.ents:", " print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))", From a08ca064e53810cf1c7c0aa1ee7030654d11b5aa Mon Sep 17 00:00:00 2001 From: Victoria <80417010+victorialslocum@users.noreply.github.com> Date: Tue, 21 Jun 2022 01:03:41 -0500 Subject: [PATCH 04/25] Update linguistic-features.md (#10993) Change link for downloading fasttext word vectors --- website/docs/usage/linguistic-features.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index c547ec0bc..9dae6f2ee 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1899,7 +1899,7 @@ access to some nice Latin vectors. 
You can then pass the directory path to > ``` ```cli -$ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz +$ wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz $ python -m spacy init vectors en cc.la.300.vec.gz /tmp/la_vectors_wiki_lg ``` From 0271306f1603a3f70870c1786e8783fe39e22bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 21 Jun 2022 08:26:59 +0200 Subject: [PATCH 05/25] Use thinc-apple-ops>=0.1.0.dev0 with `apple` extras (#10904) * Use thinc-apple-ops>=0.1.0.dev0 with `apple` extras Also test with thinc-apple-ops that is at least 0.1.0.dev0. * Check thinc-apple-ops on macOS with Python 3.10 Co-authored-by: Adriane Boyd * Use `pip install --pre` for installing thinc-apple-ops in CI Co-authored-by: Adriane Boyd --- .github/azure-steps.yml | 4 ++-- setup.cfg | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 80c88b0b8..d7233328a 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -111,7 +111,7 @@ steps: condition: eq(variables['python_version'], '3.8') - script: | - ${{ parameters.prefix }} python -m pip install thinc-apple-ops + ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops ${{ parameters.prefix }} python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) diff --git a/setup.cfg b/setup.cfg index 110a2e4ee..d317847ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -104,7 +104,7 @@ cuda114 = cuda115 = cupy-cuda115>=5.0.0b4,<11.0.0 apple = - thinc-apple-ops>=0.0.4,<1.0.0 + thinc-apple-ops>=0.1.0.dev0,<1.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.5.2,!=0.6.1 From 0fa004c4cd718319d750abad896447c114f39106 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 21 Jun 2022 21:00:07 +0100 Subject: [PATCH 06/25] the 'new' indicator wants a 'number' (#10997) --- website/docs/api/spanruler.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/spanruler.md b/website/docs/api/spanruler.md index a1c222714..b573f7c58 100644 --- a/website/docs/api/spanruler.md +++ b/website/docs/api/spanruler.md @@ -2,7 +2,7 @@ title: SpanRuler tag: class source: spacy/pipeline/span_ruler.py -new: 3.3.1 +new: 3.3 teaser: 'Pipeline component for rule-based span and named entity recognition' api_string_name: span_ruler api_trainable: false From bed23ff291f3e97f5ba6ee42f1a80db7c713b691 Mon Sep 17 00:00:00 2001 From: jademlc <68696651+jademlc@users.noreply.github.com> Date: Wed, 22 Jun 2022 20:45:26 +0200 Subject: [PATCH 07/25] Update serialization methods code block (#11004) * Update serialization methods code block * Update website/docs/usage/saving-loading.md Co-authored-by: Adriane Boyd --- website/docs/usage/saving-loading.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index af140e7a7..0fd713a49 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -203,11 +203,14 @@ the data to and from a JSON file. 
```python ### {highlight="16-23,25-30"} +import json +from spacy import Language from spacy.util import ensure_path @Language.factory("my_component") class CustomComponent: - def __init__(self): + def __init__(self, nlp: Language, name: str = "my_component"): + self.name = name self.data = [] def __call__(self, doc): @@ -231,7 +234,7 @@ class CustomComponent: # This will receive the directory path + /my_component data_path = path / "data.json" with data_path.open("r", encoding="utf8") as f: - self.data = json.loads(f) + self.data = json.load(f) return self ``` From 3335bb9d0c9df99f20460ed18e07d8844200d7d7 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Thu, 23 Jun 2022 02:15:28 -0400 Subject: [PATCH 08/25] remove `cuda116` extra from install widget (#11012) --- website/src/widgets/quickstart-install.js | 1 - 1 file changed, 1 deletion(-) diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 926d76ae3..ccc6b56d9 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -24,7 +24,6 @@ const CUDA = { '11.3': 'cuda113', '11.4': 'cuda114', '11.5': 'cuda115', - '11.6': 'cuda116', } const LANG_EXTRAS = ['ja'] // only for languages with models From f1197d9175927b453312be633cd789157c17a6e7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 23 Jun 2022 08:16:38 +0200 Subject: [PATCH 09/25] Add API docs for token attribute symbols (#10836) * Add API docs for token attribute symbols * Remove NBSP's * Fix typo * Rephrase Co-authored-by: svlandeg --- website/docs/api/attributes.md | 78 ++++++++++++++++++++++++++++++++++ website/meta/sidebars.json | 1 + 2 files changed, 79 insertions(+) create mode 100644 website/docs/api/attributes.md diff --git a/website/docs/api/attributes.md b/website/docs/api/attributes.md new file mode 100644 index 000000000..adacd3898 --- /dev/null +++ b/website/docs/api/attributes.md @@ -0,0 +1,78 @@ +--- +title: Attributes +teaser: Token attributes +source: spacy/attrs.pyx +--- + +[Token](/api/token) attributes are specified using internal IDs in many places +including: + +- [`Matcher` patterns](/api/matcher#patterns), +- [`Doc.to_array`](/api/doc#to_array) and + [`Doc.from_array`](/api/doc#from_array) +- [`Doc.has_annotation`](/api/doc#has_annotation) +- [`MultiHashEmbed`](/api/architectures#MultiHashEmbed) Tok2Vec architecture + `attrs` + +> ```python +> import spacy +> from spacy.attrs import DEP +> +> nlp = spacy.blank("en") +> doc = nlp("There are many attributes.") +> +> # DEP always has the same internal value +> assert DEP == 76 +> +> # "DEP" is automatically converted to DEP +> assert DEP == nlp.vocab.strings["DEP"] +> assert doc.has_annotation(DEP) == doc.has_annotation("DEP") +> +> # look up IDs in spacy.attrs.IDS +> from spacy.attrs import IDS +> assert IDS["DEP"] == DEP +> ``` + +All methods automatically convert between the string version of an ID (`"DEP"`) +and the internal integer symbols (`DEP`). The internal IDs can be imported from +`spacy.attrs` or retrieved from the [`StringStore`](/api/stringstore). A map +from string attribute names to internal attribute IDs is stored in +`spacy.attrs.IDS`. + +The corresponding [`Token` object attributes](/api/token#attributes) can be +accessed using the same names in lowercase, e.g. `token.orth` or `token.length`. +For attributes that represent string values, the internal integer ID is +accessed as `Token.attr`, e.g. 
`token.dep`, while the string value can be +retrieved by appending `_` as in `token.dep_`. + + +| Attribute | Description | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `DEP` | The token's dependency label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | +| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | +| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | +| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | +| `IS_PUNCT` | Token is punctuation. ~~bool~~ | +| `IS_SPACE` | Token is whitespace. ~~bool~~ | +| `IS_STOP` | Token is a stop word. ~~bool~~ | +| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | +| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | +| `LEMMA` | The token's lemma. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | +| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | +| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `POS` | The token's universal part of speech (UPOS). ~~str~~ | +| `SENT_START` | Token is start of sentence. ~~bool~~ | +| `SHAPE` | The token's shape. ~~str~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `TAG` | The token's fine-grained part of speech. 
~~str~~ | diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index c23f0a255..1bc395a66 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -124,6 +124,7 @@ { "label": "Other", "items": [ + { "text": "Attributes", "url": "/api/attributes" }, { "text": "Corpus", "url": "/api/corpus" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, From d4e3f43639a963125bad123abe9514a1e6da81fc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 23 Jun 2022 09:50:25 +0200 Subject: [PATCH 10/25] Update thinc version to switch back to blis v0.7 (#11014) --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4fea41be2..4e388e54f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.0.dev2,<8.2.0", + "thinc>=8.1.0.dev3,<8.2.0", "pathy", "numpy>=1.15.0", ] diff --git a/requirements.txt b/requirements.txt index 082ef1522..3b77140f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.1.0.dev2,<8.2.0 +thinc>=8.1.0.dev3,<8.2.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.1.0 diff --git a/setup.cfg b/setup.cfg index d317847ba..ba5b46ff0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.1.0.dev2,<8.2.0 + thinc>=8.1.0.dev3,<8.2.0 install_requires = # Our libraries spacy-legacy>=3.0.9,<3.1.0 @@ -46,7 +46,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.1.0.dev2,<8.2.0 + thinc>=8.1.0.dev3,<8.2.0 wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 From f8116078ce2c5760ae218bc1657977ed116fcf18 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 23 Jun 2022 09:57:46 +0100 Subject: [PATCH 11/25] disable failing test because Stanford servers are down (#11015) --- spacy/tests/training/test_readers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 8c5c81625..eb07a52b1 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -60,11 +60,12 @@ def test_readers(): assert isinstance(extra_corpus, Callable) +# TODO: enable IMDB test once Stanford servers are back up and running @pytest.mark.slow @pytest.mark.parametrize( "reader,additional_config", [ - ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}), + # ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}), ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}), ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}), ], From 4cd8b4cc222bebc2108eb52b4400eea562db4ac2 Mon Sep 17 00:00:00 2001 From: Dmytro Sadovnychyi Date: Thu, 23 Jun 2022 17:53:00 +0200 Subject: [PATCH 12/25] Fix some of the broken links on universe pages (#11011) Currently some of the "AUTHOR INFO" links (e.g. here[0]) are broken: ``` https://github.com/https://github.com/explosion ``` [0] https://spacy.io/universe/project/spacy-experimental Also one remains broken with `https://szegedai.github.io/`. 
--- website/meta/universe.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 4a3ec6225..ab64fe895 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -84,7 +84,7 @@ "code_language": "python", "author": "Leap Beyond", "author_links": { - "github": "https://github.com/LeapBeyond", + "github": "LeapBeyond", "website": "https://leapbeyond.ai" }, "code_example": [ @@ -107,8 +107,8 @@ "code_language": "python", "author": "Peter Baumgartner", "author_links": { - "twitter" : "https://twitter.com/pmbaumgartner", - "github": "https://github.com/pmbaumgartner", + "twitter" : "pmbaumgartner", + "github": "pmbaumgartner", "website": "https://www.peterbaumgartner.com/" }, "code_example": [ @@ -127,8 +127,8 @@ "code_language": "python", "author": "Explosion", "author_links": { - "twitter" : "https://twitter.com/explosion_ai", - "github": "https://github.com/explosion", + "twitter" : "explosion_ai", + "github": "explosion", "website": "https://explosion.ai/" }, "code_example": [ @@ -600,8 +600,8 @@ "code_language": "python", "author": "Keith Rozario", "author_links": { - "twitter" : "https://twitter.com/keithrozario", - "github": "https://github.com/keithrozario", + "twitter" : "keithrozario", + "github": "keithrozario", "website": "https://www.keithrozario.com" }, "code_example": [ @@ -2324,7 +2324,7 @@ "author": "Daniel Whitenack & Chris Benson", "author_links": { "website": "https://changelog.com/practicalai", - "twitter": "https://twitter.com/PracticalAIFM" + "twitter": "PracticalAIFM" }, "category": ["podcasts"] }, From 9738b69c0e3babb365cafaa26b872ca1028c9696 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Fri, 24 Jun 2022 02:11:29 -0400 Subject: [PATCH 13/25] Update Code Conventions.md (#11018) --- extra/DEVELOPER_DOCS/Code Conventions.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md index 37cd8ff27..31a87d362 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -455,6 +455,10 @@ Regression tests are tests that refer to bugs reported in specific issues. They The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file. +### Testing Cython Code + +If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`. + ### Constructing objects and state Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation. 
From bffe54d02b840a73f8dec4d8cd50056507695853 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 24 Jun 2022 08:48:58 +0200 Subject: [PATCH 14/25] Set version to v3.4.0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 03eabc2e9..ef0358e1a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.3.0" +__version__ = "3.4.0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From d9320db7db74b970b3751e38ed6f14de5b7d16d5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 1 Apr 2022 10:42:25 +0200 Subject: [PATCH 15/25] Temporarily skip tests that require models/compat --- .github/azure-steps.yml | 34 +++++++++++++++++----------------- spacy/tests/test_cli.py | 2 ++ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index d7233328a..41f743feb 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -64,12 +64,12 @@ steps: displayName: "Run GPU tests" condition: eq(${{ parameters.gpu }}, true) - - script: | - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md - python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - displayName: 'Test download CLI' - condition: eq(variables['python_version'], '3.8') +# - script: | +# python -m spacy download ca_core_news_sm +# python -m spacy download ca_core_news_md +# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" +# displayName: 'Test download CLI' +# condition: eq(variables['python_version'], '3.8') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
@@ -93,17 +93,17 @@ steps: displayName: 'Test train CLI' condition: eq(variables['python_version'], '3.8') - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir - displayName: 'Test assemble CLI' - condition: eq(variables['python_version'], '3.8') - - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" - python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - displayName: 'Test assemble CLI vectors warning' - condition: eq(variables['python_version'], '3.8') +# - script: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" +# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir +# displayName: 'Test assemble CLI' +# condition: eq(variables['python_version'], '3.8') +# +# - script: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" +# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 +# displayName: 'Test assemble CLI vectors warning' +# condition: eq(variables['python_version'], '3.8') - script: | python .github/validate_universe_json.py website/meta/universe.json diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e00369..fe8b3a8a1 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -589,6 +589,7 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] +@pytest.mark.skip(reason="Temporarily skip for dev version") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -599,6 +600,7 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) +@pytest.mark.skip(reason="Temporarily skip for dev version") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 8f1ba4de582c5e5282c022a7713a56b47302cabe Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 24 Jun 2022 13:39:52 +0200 Subject: [PATCH 16/25] Backport parser/alignment optimizations from `feature/refactor-parser` (#10952) --- spacy/training/alignment_array.pyx | 20 +++-- spacy/training/example.pyx | 129 +++++++++++++++++++++++------ spacy/util.py | 7 ++ 3 files changed, 123 insertions(+), 33 deletions(-) diff --git a/spacy/training/alignment_array.pyx b/spacy/training/alignment_array.pyx index b58f08786..01e9d9bf8 100644 --- a/spacy/training/alignment_array.pyx +++ b/spacy/training/alignment_array.pyx @@ -1,33 +1,39 @@ from typing import List from ..errors import Errors import numpy +from libc.stdint cimport int32_t cdef class AlignmentArray: """AlignmentArray is similar to Thinc's Ragged with two simplfications: indexing returns numpy arrays and this type can only be used for CPU arrays. 
- However, these changes make AlginmentArray more efficient for indexing in a + However, these changes make AlignmentArray more efficient for indexing in a tight loop.""" __slots__ = [] def __init__(self, alignment: List[List[int]]): - self._lengths = None - self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i") - cdef int data_len = 0 cdef int outer_len cdef int idx + + self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32') + cdef int32_t* starts_ends_ptr = self._starts_ends.data + for idx, outer in enumerate(alignment): outer_len = len(outer) - self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len + starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len data_len += outer_len - self._data = numpy.empty(data_len, dtype="i") + self._lengths = None + self._data = numpy.empty(data_len, dtype="int32") + idx = 0 + cdef int32_t* data_ptr = self._data.data + for outer in alignment: for inner in outer: - self._data[idx] = inner + data_ptr[idx] = inner idx += 1 def __getitem__(self, idx): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 045f0b483..473364f93 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans, remove_bilu_prefix from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj from ..tokens.token cimport MISSING_DEP -from ..util import logger, to_ternary_int +from ..util import logger, to_ternary_int, all_equal cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): @@ -151,50 +151,127 @@ cdef class Example: self._y_sig = y_sig return self._cached_alignment + + def _get_aligned_vectorized(self, align, gold_values): + # Fast path for Doc attributes/fields that are predominantly a single value, + # i.e., TAG, POS, MORPH. + x2y_single_toks = [] + x2y_single_toks_i = [] + + x2y_multiple_toks = [] + x2y_multiple_toks_i = [] + + # Gather indices of gold tokens aligned to the candidate tokens into two buckets. + # Bucket 1: All tokens that have a one-to-one alignment. + # Bucket 2: All tokens that have a one-to-many alignment. + for idx, token in enumerate(self.predicted): + aligned_gold_i = align[token.i] + aligned_gold_len = len(aligned_gold_i) + + if aligned_gold_len == 1: + x2y_single_toks.append(aligned_gold_i.item()) + x2y_single_toks_i.append(idx) + elif aligned_gold_len > 1: + x2y_multiple_toks.append(aligned_gold_i) + x2y_multiple_toks_i.append(idx) + + # Map elements of the first bucket directly to the output array. + output = numpy.full(len(self.predicted), None) + output[x2y_single_toks_i] = gold_values[x2y_single_toks].squeeze() + + # Collapse many-to-one alignments into one-to-one alignments if they + # share the same value. Map to None in all other cases. + for i in range(len(x2y_multiple_toks)): + aligned_gold_values = gold_values[x2y_multiple_toks[i]] + + # If all aligned tokens have the same value, use it. + if all_equal(aligned_gold_values): + x2y_multiple_toks[i] = aligned_gold_values[0].item() + else: + x2y_multiple_toks[i] = None + + output[x2y_multiple_toks_i] = x2y_multiple_toks + + return output.tolist() + + + def _get_aligned_non_vectorized(self, align, gold_values): + # Slower path for fields that return multiple values (resulting + # in ragged arrays that cannot be vectorized trivially). 
+ output = [None] * len(self.predicted) + + for token in self.predicted: + aligned_gold_i = align[token.i] + values = gold_values[aligned_gold_i].ravel() + if len(values) == 1: + output[token.i] = values.item() + elif all_equal(values): + # If all aligned tokens have the same value, use it. + output[token.i] = values[0].item() + + return output + + def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" align = self.alignment.x2y + gold_values = self.reference.to_array([field]) + + if len(gold_values.shape) == 1: + output = self._get_aligned_vectorized(align, gold_values) + else: + output = self._get_aligned_non_vectorized(align, gold_values) vocab = self.reference.vocab - gold_values = self.reference.to_array([field]) - output = [None] * len(self.predicted) - for token in self.predicted: - values = gold_values[align[token.i]] - values = values.ravel() - if len(values) == 0: - output[token.i] = None - elif len(values) == 1: - output[token.i] = values[0] - elif len(set(list(values))) == 1: - # If all aligned tokens have the same value, use it. - output[token.i] = values[0] - else: - output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] + return output def get_aligned_parse(self, projectivize=True): cand_to_gold = self.alignment.x2y gold_to_cand = self.alignment.y2x - aligned_heads = [None] * self.x.length - aligned_deps = [None] * self.x.length - has_deps = [token.has_dep() for token in self.y] - has_heads = [token.has_head() for token in self.y] heads = [token.head.i for token in self.y] deps = [token.dep_ for token in self.y] + if projectivize: proj_heads, proj_deps = nonproj.projectivize(heads, deps) + has_deps = [token.has_dep() for token in self.y] + has_heads = [token.has_head() for token in self.y] + # ensure that missing data remains missing heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)] deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)] - for cand_i in range(self.x.length): - if cand_to_gold.lengths[cand_i] == 1: - gold_i = cand_to_gold[cand_i][0] - if gold_to_cand.lengths[heads[gold_i]] == 1: - aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]][0]) - aligned_deps[cand_i] = deps[gold_i] - return aligned_heads, aligned_deps + + # Select all candidate tokens that are aligned to a single gold token. + c2g_single_toks = numpy.where(cand_to_gold.lengths == 1)[0] + + # Fetch all aligned gold token incides. + if c2g_single_toks.shape == cand_to_gold.lengths.shape: + # This the most likely case. + gold_i = cand_to_gold[:].squeeze() + else: + gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze() + + # Fetch indices of all gold heads for the aligned gold tokens. + heads = numpy.asarray(heads, dtype='i') + gold_head_i = heads[gold_i] + + # Select all gold tokens that are heads of the previously selected + # gold tokens (and are aligned to a single candidate token). + g2c_len_heads = gold_to_cand.lengths[gold_head_i] + g2c_len_heads = numpy.where(g2c_len_heads == 1)[0] + g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze() + + # Update head/dep alignments with the above. 
+ aligned_heads = numpy.full((self.x.length), None) + aligned_heads[c2g_single_toks[g2c_len_heads]] = g2c_i + + deps = numpy.asarray(deps) + aligned_deps = numpy.full((self.x.length), None) + aligned_deps[c2g_single_toks] = deps[gold_i] + + return aligned_heads.tolist(), aligned_deps.tolist() def get_aligned_sent_starts(self): """Get list of SENT_START attributes aligned to the predicted tokenization. diff --git a/spacy/util.py b/spacy/util.py index 9b871b87b..4f21d618a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1716,3 +1716,10 @@ def packages_distributions() -> Dict[str, List[str]]: for pkg in (dist.read_text("top_level.txt") or "").split(): pkg_to_dist[pkg].append(dist.metadata["Name"]) return dict(pkg_to_dist) + + +def all_equal(iterable): + """Return True if all the elements are equal to each other + (or if the input is an empty sequence), False otherwise.""" + g = itertools.groupby(iterable) + return next(g, True) and not next(g, False) From 4155a59d470c231b5bfca26044a6d4f93bea7e48 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 27 Jun 2022 09:35:35 +0200 Subject: [PATCH 17/25] Auto-format code with black (#11022) Co-authored-by: explosion-bot --- spacy/tests/parser/test_ner.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 53bb2d554..00889efdc 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -158,13 +158,18 @@ def test_issue3209(): def test_labels_from_BILUO(): - """Test that labels are inferred correctly when there's a - in label. - """ + """Test that labels are inferred correctly when there's a - in label.""" nlp = English() ner = nlp.add_pipe("ner") ner.add_label("LARGE-ANIMAL") nlp.initialize() - move_names = ["O", "B-LARGE-ANIMAL", "I-LARGE-ANIMAL", "L-LARGE-ANIMAL", "U-LARGE-ANIMAL"] + move_names = [ + "O", + "B-LARGE-ANIMAL", + "I-LARGE-ANIMAL", + "L-LARGE-ANIMAL", + "U-LARGE-ANIMAL", + ] labels = {"LARGE-ANIMAL"} assert ner.move_names == move_names assert set(ner.labels) == labels From 308a612ec98f27098fe7f69ec20be0b5e88d51fa Mon Sep 17 00:00:00 2001 From: Eric Holscher <25510+ericholscher@users.noreply.github.com> Date: Mon, 27 Jun 2022 00:45:22 -0700 Subject: [PATCH 18/25] Remove `simply` (#11017) I was reading this page, and as a relative beginner, nothing about it was simple :) --- website/docs/api/architectures.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 2bddcb28c..2537faff6 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -587,7 +587,7 @@ consists of either two or three subnetworks: run once for each batch. - **lower**: Construct a feature-specific vector for each `(token, feature)` pair. This is also run once for each batch. Constructing the state - representation is then simply a matter of summing the component features and + representation is then a matter of summing the component features and applying the non-linearity. - **upper** (optional): A feed-forward network that predicts scores from the state representation. If not present, the output from the lower model is used @@ -628,7 +628,7 @@ same signature, but the `use_upper` argument was `True` by default. > ``` Build a tagger model, using a provided token-to-vector component. 
The tagger -model simply adds a linear layer with softmax activation to predict scores given +model adds a linear layer with softmax activation to predict scores given the token vectors. | Name | Description | @@ -920,5 +920,5 @@ A function that reads an existing `KnowledgeBase` from file. A function that takes as input a [`KnowledgeBase`](/api/kb) and a [`Span`](/api/span) object denoting a named entity, and returns a list of plausible [`Candidate`](/api/kb/#candidate) objects. The default -`CandidateGenerator` simply uses the text of a mention to find its potential +`CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. From 8ffff18ac4e6a1d4fdae76dd7a9ecdf251b149fa Mon Sep 17 00:00:00 2001 From: Zackere Date: Tue, 28 Jun 2022 15:11:15 +0200 Subject: [PATCH 19/25] Try cloning repo from main & master (#10843) * Try cloning repo from main & master * fixup! Try cloning repo from main & master * fixup! fixup! Try cloning repo from main & master * refactor clone and check for repo:branch existence * spacing fix * make mypy happy * type util function * Update spacy/cli/project/clone.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Co-authored-by: Sofie Van Landeghem --- spacy/cli/_util.py | 17 +++++++++++++++++ spacy/cli/project/clone.py | 30 +++++++++++++++++++++++------- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index bb7f2d352..ae43b991b 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -462,6 +462,23 @@ def git_sparse_checkout(repo, subpath, dest, branch): shutil.move(str(source_path), str(dest)) +def git_repo_branch_exists(repo: str, branch: str) -> bool: + """Uses 'git ls-remote' to check if a repository and branch exists + + repo (str): URL to get repo. + branch (str): Branch on repo to check. + RETURNS (bool): True if repo:branch exists. + """ + get_git_version() + cmd = f"git ls-remote {repo} {branch}" + # We might be tempted to use `--exit-code` with `git ls-remote`, but + # `run_command` handles the `returncode` for us, so we'll rely on + # the fact that stdout returns '' if the requested branch doesn't exist + ret = run_command(cmd, capture=True) + exists = ret.stdout != "" + return exists + + def get_git_version( error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", ) -> Tuple[int, int]: diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index 360ee3428..14b4ed9b5 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -7,11 +7,11 @@ import re from ... import about from ...util import ensure_path from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE -from .._util import git_checkout, get_git_version +from .._util import git_checkout, get_git_version, git_repo_branch_exists DEFAULT_REPO = about.__projects__ DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ -DEFAULT_BRANCH = "master" +DEFAULT_BRANCHES = ["main", "master"] @project_cli.command("clone") @@ -20,7 +20,7 @@ def project_clone_cli( name: str = Arg(..., help="The name of the template to clone"), dest: Optional[Path] = Arg(None, help="Where to clone the project. 
Defaults to current working directory", exists=False), repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), - branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"), + branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"), sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") # fmt: on ): @@ -33,9 +33,25 @@ def project_clone_cli( """ if dest is None: dest = Path.cwd() / Path(name).parts[-1] + if repo == DEFAULT_REPO and branch is None: + branch = DEFAULT_PROJECTS_BRANCH + if branch is None: - # If it's a user repo, we want to default to other branch - branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH + for default_branch in DEFAULT_BRANCHES: + if git_repo_branch_exists(repo, default_branch): + branch = default_branch + break + if branch is None: + default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES) + msg.fail( + "No branch provided and attempted default " + f"branches {default_branches_msg} do not exist.", + exits=1, + ) + else: + if not git_repo_branch_exists(repo, branch): + msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1) + assert isinstance(branch, str) project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) @@ -61,9 +77,9 @@ def project_clone( try: git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout) except subprocess.CalledProcessError: - err = f"Could not clone '{name}' from repo '{repo_name}'" + err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')" msg.fail(err, exits=1) - msg.good(f"Cloned '{name}' from {repo_name}", project_dir) + msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir) if not (project_dir / PROJECT_FILE).exists(): msg.warn(f"No {PROJECT_FILE} found in directory") else: From a9559e7435f99648aa0004f301692f1a2dfe72fe Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Tue, 28 Jun 2022 15:35:32 +0200 Subject: [PATCH 20/25] Handle Cyrillic combining diacritics (#10837) * Handle Russian, Ukrainian and Bulgarian * Corrections * Correction * Correction to comment * Changes based on review * Correction * Reverted irrelevant change in punctuation.py * Remove unnecessary group * Reverted accidental change --- spacy/lang/bg/__init__.py | 5 +++- spacy/lang/char_classes.py | 4 ++++ spacy/lang/punctuation.py | 22 ++++++++++++++++- spacy/lang/ru/__init__.py | 4 ++++ spacy/lang/uk/__init__.py | 4 ++++ spacy/tests/lang/bg/test_tokenizer.py | 8 +++++++ spacy/tests/lang/ru/test_tokenizer.py | 34 +++++++++++++++++++++++++++ spacy/tests/lang/uk/test_tokenizer.py | 7 ++++++ 8 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/lang/bg/test_tokenizer.py diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index 559cc34c4..c9176b946 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS - +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES from ...language import Language, BaseDefaults from ...attrs import LANG from ...util import update_exc @@ -16,6 +17,8 @@ 
class BulgarianDefaults(BaseDefaults): stop_words = STOP_WORDS tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES + infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES class Bulgarian(Language): diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index b15bb3cf3..1d204c46c 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -258,6 +258,10 @@ ALPHA = group_chars( ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased) +_combining_diacritics = r"\u0300-\u036f" + +COMBINING_DIACRITICS = _combining_diacritics + _units = ( "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft " "kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb " diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index e712e71d6..a1cfe6224 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,5 +1,5 @@ from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS +from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT @@ -44,3 +44,23 @@ TOKENIZER_INFIXES = ( r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) + + +# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics +# to mark stressed syllables in words where stress is distinctive. Such languages +# should use the COMBINING_DIACRITICS... suffix and infix regex lists in +# place of the standard ones. +COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [ + r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS), +] + +COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [ + r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS + ), + r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS), + r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format( + a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS + ), + r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS), +] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 5d31d8ea2..c118c26ff 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES from ...language import Language, BaseDefaults @@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES + infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES class Russian(Language): diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 21f9649f2..737243b66 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import UkrainianLemmatizer +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES 
from ...language import Language, BaseDefaults @@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES + infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES class Ukrainian(Language): diff --git a/spacy/tests/lang/bg/test_tokenizer.py b/spacy/tests/lang/bg/test_tokenizer.py new file mode 100644 index 000000000..2e2c45001 --- /dev/null +++ b/spacy/tests/lang/bg/test_tokenizer.py @@ -0,0 +1,8 @@ +import pytest + + +def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer): + text = "Ня̀маше яйца̀. Ня̀маше яйца̀." + tokens = bg_tokenizer(text) + assert tokens[1].text == "яйца̀" + assert tokens[2].text == "." diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py index 1cfdc50ee..083b55a09 100644 --- a/spacy/tests/lang/ru/test_tokenizer.py +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -1,3 +1,4 @@ +from string import punctuation import pytest @@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer): text = "(Раз, два, три, проверка)." tokens = ru_tokenizer(text) assert tokens[len(tokens) - 1].text == "." + + +@pytest.mark.parametrize( + "text", + [ + "рекоменду́я подда́ть жару́. Самого́ Баргамота", + "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА", + "рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍,самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍:самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍. самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍, самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍: самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍-самого̍ Баргамота", + ], +) +def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert tokens[2].text in ("жару́", "ЖАРУ́", "жару̍") + assert tokens[3].text in punctuation + + +@pytest.mark.parametrize( + "text", + [ + "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА", + "рекоменду̍я подда̍ть жару́.самого́ Баргамота", + ], +) +def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert tokens[2].text.lower() == "жару́.самого́" diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py index 3d6e87301..6596f490a 100644 --- a/spacy/tests/lang/uk/test_tokenizer.py +++ b/spacy/tests/lang/uk/test_tokenizer.py @@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer): text = "(Раз, два, три, проверка)." tokens = uk_tokenizer(text) assert tokens[len(tokens) - 1].text == "." + + +def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer): + text = "Хлібі́в не було́. Хлібі́в не було́." + tokens = uk_tokenizer(text) + assert tokens[2].text == "було́" + assert tokens[3].text == "." 
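
A quick sketch of what the new combining-diacritics rules change in practice, mirroring the Russian tokenizer test added in this commit. It is an illustration only, not part of the patch series: it assumes a spaCy build that already includes this change, and a blank `ru` pipeline is enough here because only the tokenizer is exercised and no trained model is loaded.

```python
import spacy

# Blank Russian pipeline: only the tokenizer, which now uses the
# COMBINING_DIACRITICS_TOKENIZER_SUFFIXES/INFIXES rules, is involved.
nlp = spacy.blank("ru")

# The stressed vowels carry a combining acute accent (U+0301). With the
# rules from this commit, the final "жару́." is split into a word token
# plus a trailing period instead of staying glued together.
doc = nlp("рекоменду́я подда́ть жару́. Самого́ Баргамота")

print([t.text for t in doc])
assert doc[2].text == "жару́"
assert doc[3].text == "."
```

Ukrainian and Bulgarian are wired up to the same suffix and infix lists through their `Defaults`, which is why the commit adds parallel `uk` and `bg` tokenizer tests.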
From 1d5cad0b42c5919dde27a59808ff97f8e15cfaa0 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 28 Jun 2022 19:42:58 +0200 Subject: [PATCH 21/25] `Example.get_aligned_parse`: Handle unit and zero length vectors correctly (#11026) * `Example.get_aligned_parse`: Do not squeeze gold token idx vector Correctly handle zero-size vectors passed to `np.vectorize` * Add tests * Use `Doc` ctor to initialize attributes * Remove unintended change Co-authored-by: Adriane Boyd * Remove unused import Co-authored-by: Adriane Boyd --- spacy/tests/training/test_training.py | 25 +++++++++++++++++++++++++ spacy/training/example.pyx | 6 +++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 31bf7e07b..4384a796d 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -679,6 +679,31 @@ def test_projectivize(en_tokenizer): assert proj_heads == [3, 2, 3, 3, 3] assert nonproj_heads == [3, 2, 3, 3, 2] + # Test single token documents + doc = en_tokenizer("Conrail") + heads = [0] + deps = ["dep"] + example = Example.from_dict(doc, {"heads": heads, "deps": deps}) + proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) + assert proj_heads == heads + assert proj_labels == deps + + # Test documents with no alignments + doc_a = Doc( + doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0] + ) + doc_b = Doc( + doc.vocab, + words=["Double", "-", "Jointed"], + spaces=[True, True, True], + deps=["amod", "punct", "ROOT"], + heads=[2, 2, 2], + ) + example = Example(doc_a, doc_b) + proj_heads, proj_deps = example.get_aligned_parse(projectivize=True) + assert proj_heads == [None] + assert proj_deps == [None] + def test_iob_to_biluo(): good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 473364f93..d592e5a52 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -249,9 +249,9 @@ cdef class Example: # Fetch all aligned gold token incides. if c2g_single_toks.shape == cand_to_gold.lengths.shape: # This the most likely case. - gold_i = cand_to_gold[:].squeeze() + gold_i = cand_to_gold[:] else: - gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze() + gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks) # Fetch indices of all gold heads for the aligned gold tokens. heads = numpy.asarray(heads, dtype='i') @@ -261,7 +261,7 @@ cdef class Example: # gold tokens (and are aligned to a single candidate token). g2c_len_heads = gold_to_cand.lengths[gold_head_i] g2c_len_heads = numpy.where(g2c_len_heads == 1)[0] - g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze() + g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze() # Update head/dep alignments with the above. aligned_heads = numpy.full((self.x.length), None) From 24f4908fce4740130fc5355f28e9aa87cadd9817 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Jun 2022 19:50:47 +0200 Subject: [PATCH 22/25] Update vector handling in similarity methods (#11013) Distinguish between vectors that are 0 vs. missing vectors when warning about missing vectors. Update `Doc.has_vector` to match `Span.has_vector` and `Token.has_vector` for cases where the vocab has vectors but none of the tokens in the container have vectors. 
--- spacy/tests/vocab_vectors/test_similarity.py | 33 +++++++++++++++----- spacy/tests/vocab_vectors/test_vectors.py | 10 +++--- spacy/tokens/doc.pyx | 5 +-- spacy/tokens/span.pyx | 3 +- spacy/tokens/token.pyx | 3 +- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index 47cd1f060..1efcdd81e 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -1,6 +1,7 @@ import pytest import numpy from spacy.tokens import Doc +from spacy.vocab import Vocab from ..util import get_cosine, add_vecs_to_vocab @@ -71,19 +72,17 @@ def test_vectors_similarity_DD(vocab, vectors): def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert isinstance(doc.similarity(doc[0]), float) - assert isinstance(doc[0].similarity(doc), float) - assert doc.similarity(doc[0]) == doc[0].similarity(doc) + assert isinstance(doc.similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc), float) + assert doc.similarity(doc[0]) == doc[0].similarity(doc) def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert isinstance(doc[:2].similarity(doc[0]), float) - assert isinstance(doc[0].similarity(doc[-2]), float) - assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + assert isinstance(doc[:2].similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc[:2]), float) + assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) def test_vectors_similarity_DS(vocab, vectors): @@ -91,3 +90,21 @@ def test_vectors_similarity_DS(vocab, vectors): doc = Doc(vocab, words=[word1, word2]) assert isinstance(doc.similarity(doc[:2]), float) assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) + + +def test_vectors_similarity_no_vectors(): + vocab = Vocab() + doc1 = Doc(vocab, words=["a", "b"]) + doc2 = Doc(vocab, words=["c", "d", "e"]) + with pytest.warns(UserWarning): + doc1.similarity(doc2) + with pytest.warns(UserWarning): + doc1.similarity(doc2[1]) + with pytest.warns(UserWarning): + doc1.similarity(doc2[:2]) + with pytest.warns(UserWarning): + doc2.similarity(doc1) + with pytest.warns(UserWarning): + doc2[1].similarity(doc1) + with pytest.warns(UserWarning): + doc2[:2].similarity(doc1) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index e3ad206f4..dd2cfc596 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -318,17 +318,15 @@ def test_vectors_lexeme_doc_similarity(vocab, text): @pytest.mark.parametrize("text", [["apple", "orange", "juice"]]) def test_vectors_span_span_similarity(vocab, text): doc = Doc(vocab, words=text) - with pytest.warns(UserWarning): - assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) - assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0 + assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) + assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0 @pytest.mark.parametrize("text", [["apple", "orange", "juice"]]) def test_vectors_span_doc_similarity(vocab, text): doc = Doc(vocab, words=text) - with pytest.warns(UserWarning): - assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) - assert -1.0 < doc[0:2].similarity(doc) < 1.0 + assert doc[0:2].similarity(doc) == 
doc.similarity(doc[0:2]) + assert -1.0 < doc[0:2].similarity(doc) < 1.0 @pytest.mark.parametrize( diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e38de02b4..d9a104ac8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -607,7 +607,8 @@ cdef class Doc: if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: - warnings.warn(Warnings.W008.format(obj="Doc")) + if not self.has_vector or not other.has_vector: + warnings.warn(Warnings.W008.format(obj="Doc")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -627,7 +628,7 @@ cdef class Doc: if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) elif self.vocab.vectors.size: - return True + return any(token.has_vector for token in self) elif self.tensor.size: return True else: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index ab888ae95..c3495f497 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -354,7 +354,8 @@ cdef class Span: if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Span")) if self.vector_norm == 0.0 or other.vector_norm == 0.0: - warnings.warn(Warnings.W008.format(obj="Span")) + if not self.has_vector or not other.has_vector: + warnings.warn(Warnings.W008.format(obj="Span")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index d14930348..7fff6b162 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -206,7 +206,8 @@ cdef class Token: if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Token")) if self.vector_norm == 0 or other.vector_norm == 0: - warnings.warn(Warnings.W008.format(obj="Token")) + if not self.has_vector or not other.has_vector: + warnings.warn(Warnings.W008.format(obj="Token")) return 0.0 vector = self.vector xp = get_array_module(vector) From dd038b536cf632408080d9a88f3bc4bf2ffdefe4 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 28 Jun 2022 14:42:40 -0400 Subject: [PATCH 23/25] fix to horizontal space (#10994) --- spacy/displacy/render.py | 19 +++++++++++++++---- spacy/displacy/templates.py | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 247ad996b..a730ce522 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -64,8 +64,11 @@ class SpanRenderer: # Set up how the text and labels will be rendered self.direction = DEFAULT_DIR self.lang = DEFAULT_LANG + # These values are in px self.top_offset = options.get("top_offset", 40) - self.top_offset_step = options.get("top_offset_step", 17) + # This is how far under the top offset the span labels appear + self.span_label_offset = options.get("span_label_offset", 20) + self.offset_step = options.get("top_offset_step", 17) # Set up which templates will be used template = options.get("template") @@ -161,8 +164,16 @@ class SpanRenderer: if entities: slices = self._get_span_slices(token["entities"]) starts = self._get_span_starts(token["entities"]) + total_height = ( + self.top_offset + + self.span_label_offset + + (self.offset_step * (len(entities) - 1)) + ) markup += self.span_template.format( - text=token["text"], span_slices=slices, span_starts=starts + text=token["text"], + span_slices=slices, + span_starts=starts, + total_height=total_height, ) else: markup += escape_html(token["text"] + " ") @@ -171,7 +182,7 @@ 
class SpanRenderer: def _get_span_slices(self, entities: List[Dict]) -> str: """Get the rendered markup of all Span slices""" span_slices = [] - for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + for entity, step in zip(entities, itertools.count(step=self.offset_step)): color = self.colors.get(entity["label"].upper(), self.default_color) span_slice = self.span_slice_template.format( bg=color, top_offset=self.top_offset + step @@ -182,7 +193,7 @@ class SpanRenderer: def _get_span_starts(self, entities: List[Dict]) -> str: """Get the rendered markup of all Span start tokens""" span_starts = [] - for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + for entity, step in zip(entities, itertools.count(step=self.offset_step)): color = self.colors.get(entity["label"].upper(), self.default_color) span_start = ( self.span_start_template.format( diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ff81e7a1d..40f5376b1 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -67,7 +67,7 @@ TPL_SPANS = """ """ TPL_SPAN = """ - + {text} {span_slices} {span_starts} From 0ff14aabcecef1003fa3cb6fb6227041bb0df73b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 29 Jun 2022 12:58:31 +0200 Subject: [PATCH 24/25] vectors: avoid expensive comparisons between numpy ints and Python ints (#10992) * vectors: avoid expensive comparisons between numpy ints and Python ints * vectors: avoid failure on lists of ints * Convert another numpy int to Python --- spacy/vectors.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 93f6818ee..8300220c1 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -336,10 +336,10 @@ cdef class Vectors: xp = get_array_module(self.data) if key is not None: key = get_string_id(key) - return self.key2row.get(key, -1) + return self.key2row.get(int(key), -1) elif keys is not None: keys = [get_string_id(key) for key in keys] - rows = [self.key2row.get(key, -1) for key in keys] + rows = [self.key2row.get(int(key), -1) for key in keys] return xp.asarray(rows, dtype="i") else: row2key = {row: key for key, row in self.key2row.items()} From 4581a4f53f77114cb074d2a76a62068154fa8211 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 29 Jun 2022 20:03:36 +0200 Subject: [PATCH 25/25] Run mypy for python 3.10 (#11052) --- .github/azure-steps.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 41f743feb..1f886161a 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -27,7 +27,6 @@ steps: - script: python -m mypy spacy displayName: 'Run mypy' - condition: ne(variables['python_version'], '3.10') - task: DeleteFiles@1 inputs:
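A hedged usage sketch for the displacy fix above (#10994): the span visualizer now derives each token's total height from `top_offset`, the new `span_label_offset`, and `top_offset_step`, all in px. The option keys below mirror what `SpanRenderer` reads; the example text and spans are invented:

    import spacy
    from spacy import displacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")
    doc = nlp("Welcome to the Bank of China")
    # Overlapping spans stored under the default "sc" key
    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
    html = displacy.render(
        doc,
        style="span",
        options={"top_offset": 40, "span_label_offset": 20, "top_offset_step": 17},
    )
    # total height = top_offset + span_label_offset
    #                + top_offset_step * (number of overlapping spans - 1)
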
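For the vectors patch above (#10992), the motivation is that plain-dict lookups keyed by NumPy integers go through NumPy's slower hashing and comparison path, so converting the key to a built-in int once is cheaper. A rough, environment-dependent sketch of the effect:

    import timeit
    import numpy

    key2row = {i: i for i in range(100_000)}
    np_key = numpy.uint64(4242)

    numpy_lookup = timeit.timeit(lambda: key2row.get(np_key, -1), number=200_000)
    int_lookup = timeit.timeit(lambda: key2row.get(int(np_key), -1), number=200_000)
    # On most setups the second variant is faster, even though it pays for the
    # int() conversion inside the timed call; exact numbers vary by platform.
    print(f"numpy key: {numpy_lookup:.3f}s  int key: {int_lookup:.3f}s")
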