From 8cd77dd54cfc89c2f67ca2412490ef9b49a98518 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 4 Oct 2022 11:23:04 +0200
Subject: [PATCH 01/12] Sync flake8 version across requirements (#11580)

---
 .pre-commit-config.yaml | 2 +-
 requirements.txt        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b959262e3..df59697b1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
       language_version: python3.7
       additional_dependencies: ['click==8.0.4']
 -   repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
     hooks:
     - id: flake8
       args:
diff --git a/requirements.txt b/requirements.txt
index 446560c06..14847ff21 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,7 +28,7 @@ cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"

From ef74f8f5e447dec10ab69d2a7e94f0e09165db75 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 11 Oct 2022 14:15:22 +0200
Subject: [PATCH 02/12] Fix mypy error in edittree lemmatizer (#11612)

* cleanup imports

* try limiting Thinc to previous release

* remove Model specification

* fix code and revert Thinc constraint
---
 spacy/pipeline/edit_tree_lemmatizer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index b7d615f6d..7f6367c75 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,7 +1,6 @@
 from typing import cast, Any, Callable, Dict, Iterable, List, Optional
-from typing import Sequence, Tuple, Union
+from typing import Tuple
 from collections import Counter
-from copy import deepcopy
 from itertools import islice
 import numpy as np
 
@@ -150,7 +149,7 @@ class EditTreeLemmatizer(TrainablePipe):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
             guesses: List[Ints2d] = [
-                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
+                self.model.ops.alloc2i(0, n_labels, dtype="i") for _ in docs
             ]
             assert len(guesses) == n_docs
             return guesses

From 29649589fc889a58c8b631d569d4ae378a10aa2b Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 11 Oct 2022 15:25:05 +0200
Subject: [PATCH 03/12] remove dtype (#11615)

---
 spacy/pipeline/edit_tree_lemmatizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 7f6367c75..76b0e0bc9 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -149,7 +149,7 @@ class EditTreeLemmatizer(TrainablePipe):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
             guesses: List[Ints2d] = [
-                self.model.ops.alloc2i(0, n_labels, dtype="i") for _ in docs
+                self.model.ops.alloc2i(0, n_labels) for _ in docs
             ]
             assert len(guesses) == n_docs
             return guesses

From 2e52479eec987367117d27fb4f049df2efb2518d Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 11 Oct 2022 23:45:05 +0900
Subject: [PATCH 04/12] Fix example code for spacy-wordnet (#11593)

* Fix example code for spacy-wordnet

It looks like in the most recent version, 0.1.0, it's no longer possible
to pass the lang parameter to the component separately. Doing so will
raise an error.

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Cleanup

* More cleanup

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 website/meta/universe.json | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index a6a1a0fc7..637e9d6ce 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2460,20 +2460,20 @@
                 "import spacy",
                 "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
                 "",
-                "# Load an spacy model (supported models are \"es\" and \"en\") ",
-                "nlp = spacy.load('en')",
-                "# Spacy 3.x",
-                "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
-                "# Spacy 2.x",
+                "# Load a spaCy model (supported languages are \"es\" and \"en\") ",
+                "nlp = spacy.load('en_core_web_sm')",
+                "# spaCy 3.x",
+                "nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
+                "# spaCy 2.x",
                 "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
                 "token = nlp('prices')[0]",
                 "",
-                "# wordnet object link spacy token with nltk wordnet interface by giving acces to",
+                "# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
                 "# synsets and lemmas ",
                 "token._.wordnet.synsets()",
                 "token._.wordnet.lemmas()",
                 "",
-                "# And automatically tags with wordnet domains",
+                "# And automatically add info about WordNet domains",
                 "token._.wordnet.wordnet_domains()"
             ],
             "author": "recognai",

From fe06e037bcd733708401bce082863994b1fc48bd Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 12 Oct 2022 12:18:39 +0200
Subject: [PATCH 05/12] Fix init for pymorphy2_lookup lemmatizer mode (#11631)

---
 spacy/lang/ru/lemmatizer.py            |  2 +-
 spacy/lang/uk/lemmatizer.py            |  2 +-
 spacy/tests/conftest.py                | 17 +++++++++++++++++
 spacy/tests/lang/ru/test_lemmatizer.py | 14 ++++++++++++++
 spacy/tests/lang/uk/test_lemmatizer.py |  8 ++++++++
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 85180b1e4..5bf685d44 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index a8bc56057..d4f8cc9e5 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 742bfcc6a..394ef00d3 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -343,6 +343,14 @@ def ru_lemmatizer():
     return get_lang_class("ru")().add_pipe("lemmatizer")
 
 
+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer
@@ -422,6 +430,15 @@ def uk_lemmatizer():
     return get_lang_class("uk")().add_pipe("lemmatizer")
 
 
+@pytest.fixture
+def uk_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2_dicts_uk")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index 9ca7f441b..e82fd4f8c 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]
diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py
index 57dd4198a..788744aa1 100644
--- a/spacy/tests/lang/uk/test_lemmatizer.py
+++ b/spacy/tests/lang/uk/test_lemmatizer.py
@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]

From 4d869fcc111151bcefa08ee1a2b7b49dc5ecd677 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 12 Oct 2022 15:17:40 +0200
Subject: [PATCH 06/12] Small fixes to docstrings (#11610)

* add missing scorer arg to docstring

* fix class names in textcat_multilabel

* add missing scorer to docstrings
---
 spacy/pipeline/spancat.py            | 3 +++
 spacy/pipeline/textcat_multilabel.py | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 1b7a9eecb..ca9f1dab0 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -133,6 +133,9 @@ def make_spancat(
     spans_key (str): Key of the doc.spans dict to save the spans under. During
         initialization and training, the component will look for spans on the
         reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
     threshold (float): Minimum probability to consider a prediction positive.
         Spans with a positive prediction will be saved on the Doc. Defaults to
         0.5.
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index e33a885f8..119ae3310 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -96,8 +96,8 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "TextCategorizer":
-    """Create a TextCategorizer component. The text categorizer predicts categories
+) -> "MultiLabel_TextCategorizer":
+    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
     per doc).
@@ -105,6 +105,7 @@ def make_multilabel_textcat(
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
+    scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
         nlp.vocab, model, name, threshold=threshold, scorer=scorer
@@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.
 
         DOCS: https://spacy.io/api/textcategorizer#init
         """

From 6b5a3e72198aa9735587b0712e3eb2c24234b463 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 14 Oct 2022 08:16:49 +0200
Subject: [PATCH 07/12] Extend to pydantic v1.10 (#11635)

* Update types in `spacy.schemas` for updated pydantic+mypy
---
 requirements.txt |  2 +-
 setup.cfg        |  2 +-
 spacy/schemas.py | 18 +++++++++---------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 14847ff21..9d6bbb2c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ pathy>=0.3.5
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
diff --git a/setup.cfg b/setup.cfg
index 2dc5e7042..c2653feba 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -56,7 +56,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 048082134..ab71b2016 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
-    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
-    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
-    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
-    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
-    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
-    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+    NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+    GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+    LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+    GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+    LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")
 
     class Config:
         extra = "forbid"
@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: StrictStr = Field("", title="Description of asset")
     # fmt: on
 
@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
 class ProjectConfigAssetGit(BaseModel):
     # fmt: off
     git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: Optional[StrictStr] = Field(None, title="Description of asset")
     # fmt: on
 
@@ -508,7 +508,7 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
+    spans: Optional[Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]] = Field(
         None, title="Span information - end/start indices, label, KB ID"
     )
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(

From ceb62352bfcad49b3ad63e3e65ef12dabab645b3 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 14 Oct 2022 18:04:55 +0900
Subject: [PATCH 08/12] Auto-format code with black (#11649)

Co-authored-by: explosion-bot <explosion-bot@users.noreply.github.com>
---
 spacy/pipeline/edit_tree_lemmatizer.py | 4 +---
 spacy/schemas.py                       | 6 +++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 76b0e0bc9..12f9b73a3 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -148,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [
-                self.model.ops.alloc2i(0, n_labels) for _ in docs
-            ]
+            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
             assert len(guesses) == n_docs
             return guesses
         scores = self.model.predict(docs)
diff --git a/spacy/schemas.py b/spacy/schemas.py
index ab71b2016..a67d96d9d 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Optional[Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]] = Field(
-        None, title="Span information - end/start indices, label, KB ID"
-    )
+    spans: Optional[
+        Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+    ] = Field(None, title="Span information - end/start indices, label, KB ID")
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
         ..., title="Token information - ID, start, annotations"
     )

From 858565a5671de61334443d6a2348164bc39216e1 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 18 Oct 2022 15:11:39 +0900
Subject: [PATCH 09/12] Fix issues with DVC commands (#11592)

* Fix flag handling in dvc

Prior to this commit, if a flag (--verbose or --quiet) was passed to
DVC, it would be added to the end of the generated dvc command line.
This would result in the flag being interpreted as part of the actual
command to run, rather than an argument to dvc. This would result in
command lines like:

    spacy project run preprocess --verbose

That would fail with an error that there's no such directory as
`--verbose`.

This change puts the flags at the front of the dvc command so that they
are interpreted correctly. It removes the `run_dvc_commands` function,
which had been reduced to just a for loop and wasn't used elsewhere.

A separate problem is that there's no way to specify the quiet behaviour
to dvc from the command line, though it's unclear if that's a bug.

* Add dvc quiet flag to docs

* Handle case in DVC where no commands are appropriate

If you only have commands with no deps or outputs (admittedly unlikely), you
get a weird error about the dvc file not existing. This gives explicit
output instead.

* Add support for quiet flag

* Fix command execution

Commands are strings now because they're joined further up.
---
 spacy/cli/project/dvc.py | 57 +++++++++++++++++++++-------------------
 website/docs/api/cli.md  |  3 ++-
 2 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index 83dc5efbf..a15353855 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -25,6 +25,7 @@ def project_update_dvc_cli(
     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
     workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
     force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
     # fmt: on
 ):
@@ -36,7 +37,7 @@ def project_update_dvc_cli(
 
     DOCS: https://spacy.io/api/cli#project-dvc
     """
-    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
 
 
 def project_update_dvc(
@@ -44,6 +45,7 @@ def project_update_dvc(
     workflow: Optional[str] = None,
     *,
     verbose: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> None:
     """Update the auto-generated Data Version Control (DVC) config file. A DVC
@@ -54,11 +56,12 @@ def project_update_dvc(
     workflow (Optional[str]): Optional name of workflow defined in project.yml.
         If not set, the first workflow will be used.
     verbose (bool): Print more info.
+    quiet (bool): Print less info.
     force (bool): Force update DVC config.
     """
     config = load_project_config(project_dir)
     updated = update_dvc_config(
-        project_dir, config, workflow, verbose=verbose, force=force
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
     )
     help_msg = "To execute the workflow with DVC, run: dvc repro"
     if updated:
@@ -72,7 +75,7 @@ def update_dvc_config(
     config: Dict[str, Any],
     workflow: Optional[str] = None,
     verbose: bool = False,
-    silent: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> bool:
     """Re-run the DVC commands in dry mode and update dvc.yaml file in the
@@ -83,7 +86,7 @@ def update_dvc_config(
     path (Path): The path to the project directory.
     config (Dict[str, Any]): The loaded project.yml.
     verbose (bool): Whether to print additional info (via DVC).
-    silent (bool): Don't output anything (via DVC).
+    quiet (bool): Don't output anything (via DVC).
     force (bool): Force update, even if hashes match.
     RETURNS (bool): Whether the DVC config file was updated.
     """
@@ -105,6 +108,14 @@ def update_dvc_config(
         dvc_config_path.unlink()
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
+
     for name in workflows[workflow]:
         command = config_commands[name]
         deps = command.get("deps", [])
@@ -118,14 +129,26 @@ def update_dvc_config(
         deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
         outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
         outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
         if command.get("no_skip"):
             dvc_cmd.append("--always-changed")
         full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
         dvc_commands.append(join_command(full_cmd))
+
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+
     with working_dir(path):
-        dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, flags=dvc_flags)
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -133,26 +156,6 @@ def update_dvc_config(
     return True
 
 
-def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
-) -> None:
-    """Run a sequence of DVC commands in a subprocess, in order.
-
-    commands (List[str]): The string commands without the leading "dvc".
-    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
-        easier to pass flags like --quiet that depend on a variable or
-        command-line setting while avoiding lots of nested conditionals.
-    """
-    for c in commands:
-        command = split_command(c)
-        dvc_command = ["dvc", *command]
-        # Add the flags if they are set to True
-        for flag, is_active in flags.items():
-            if is_active:
-                dvc_command.append(flag)
-        run_command(dvc_command)
-
-
 def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
     """Validate workflows provided in project.yml and check that a given
     workflow can be used to generate a DVC config.
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e5cd3089b..fc2c46022 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
 </Infobox>
 
 ```cli
-$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
+$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
 ```
 
 > #### Example
@@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
 | `workflow`        | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
 | `--force`, `-F`   | Force-updating config file. ~~bool (flag)~~                                                                   |
 | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~                                                           |
+| `--quiet`, `-q`   | Print no output generated by DVC. ~~bool (flag)~~                                                             |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                    |
 | **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow.                 |
 

From a1eacaa8db055322d4a066a08b730243a2f5b969 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 18 Oct 2022 14:36:06 +0200
Subject: [PATCH 10/12] Add python 3.11.0rc2 to CI (#11667)

---
 .github/azure-steps.yml | 1 +
 azure-pipelines.yml     | 9 +++++++++
 2 files changed, 10 insertions(+)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 9d57219ca..cc0247b3a 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -10,6 +10,7 @@ steps:
     inputs:
       versionSpec: ${{ parameters.python_version }}
       architecture: ${{ parameters.architecture }}
+      allowUnstable: true
 
   - bash: |
       echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 2f5201614..357cce835 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -85,6 +85,15 @@ jobs:
         Python310Mac:
           imageName: "macos-latest"
           python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11.0-rc.2'
+        Python311Windows:
+          imageName: 'windows-latest'
+          python.version: '3.11.0-rc.2'
+        Python311Mac:
+          imageName: 'macos-latest'
+          python.version: '3.11.0-rc.2'
       maxParallel: 4
     pool:
       vmImage: $(imageName)

From d66ccb8eb08cd515904045de84351546065fb3ed Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Wed, 19 Oct 2022 15:52:47 +0200
Subject: [PATCH 11/12] Fix multiple entries per custom extension in doc json
 (#11551)

* Fix multiple extensions and character offset

* Rename token_start/end to start/end

* Refactor Doc.from_json based on review

* Iterate over user_data items

* Only add non-empty underscore entries

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/schemas.py                            |  4 +-
 spacy/tests/doc/test_json_doc_conversion.py | 25 +++++++----
 spacy/tokens/doc.pyx                        | 48 ++++++++++-----------
 3 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index a67d96d9d..c824d76b9 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
         title="Any custom data stored in the document's _ attribute",
         alias="_",
     )
-    underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the token's _ attribute"
     )
-    underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the span's _ attribute"
     )
diff --git a/spacy/tests/doc/test_json_doc_conversion.py b/spacy/tests/doc/test_json_doc_conversion.py
index 0d7c061c9..19698cfb2 100644
--- a/spacy/tests/doc/test_json_doc_conversion.py
+++ b/spacy/tests/doc/test_json_doc_conversion.py
@@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
     doc.spans["span_group"] = [doc[0:1]]
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
     assert json_doc["_"]["json_test2"] == [1, 2, 3]
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
 
@@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
     assert json_doc["_"]["json_test"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
 
@@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
     assert json_doc["_"]["my_ext"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["my_ext"]["value"] == 117
-    assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
+    assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
 
@@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
     doc[0]._.token_test = 117
     json_doc = doc.to_json(underscore=["span_test"])
 
-    assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
-    assert "token_test" not in json_doc["underscore_token"]
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert "underscore_token" not in json_doc
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
 
 
@@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
 
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     assert new_doc._.json_test1 == "hello world"
     assert new_doc._.json_test2 == [1, 2, 3]
     assert new_doc[0]._.token_test == 117
+    assert new_doc[1]._.token_test == 118
     assert new_doc[0:1]._.span_test == "span_attribute"
+    assert new_doc[0:2]._.span_test == "span_attribute_2"
     assert new_doc.user_data == doc.user_data
     assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
         exclude=["user_data"]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d7d2fd8e6..295f91c28 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1608,24 +1608,20 @@ cdef class Doc:
                 Doc.set_extension(attr)
             self._.set(attr, doc_json["_"][attr])
 
-        if doc_json.get("underscore_token", {}):
-            for token_attr in doc_json["underscore_token"]:
-                token_start = doc_json["underscore_token"][token_attr]["token_start"]
-                value = doc_json["underscore_token"][token_attr]["value"]
-
-                if not Token.has_extension(token_attr):
-                    Token.set_extension(token_attr)
-                self[token_start]._.set(token_attr, value)
+        for token_attr in doc_json.get("underscore_token", {}):
+            if not Token.has_extension(token_attr):
+                Token.set_extension(token_attr)
+            for token_data in doc_json["underscore_token"][token_attr]:
+                start = token_by_char(self.c, self.length, token_data["start"])
+                value = token_data["value"]
+                self[start]._.set(token_attr, value)
                 
-        if doc_json.get("underscore_span", {}):
-            for span_attr in doc_json["underscore_span"]:
-                token_start = doc_json["underscore_span"][span_attr]["token_start"]
-                token_end = doc_json["underscore_span"][span_attr]["token_end"]
-                value = doc_json["underscore_span"][span_attr]["value"]
-
-                if not Span.has_extension(span_attr):
-                    Span.set_extension(span_attr)
-                self[token_start:token_end]._.set(span_attr, value)
+        for span_attr in doc_json.get("underscore_span", {}):
+            if not Span.has_extension(span_attr):
+                Span.set_extension(span_attr)
+            for span_data in doc_json["underscore_span"][span_attr]:
+                value = span_data["value"]
+                self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
         return self
 
     def to_json(self, underscore=None):
@@ -1673,30 +1669,34 @@ cdef class Doc:
         if underscore:
             user_keys = set()
             if self.user_data:
-                data["_"] = {}
-                data["underscore_token"] = {}
-                data["underscore_span"] = {}
-                for data_key in self.user_data:
+                for data_key, value in self.user_data.copy().items():
                     if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
                         attr = data_key[1]
                         start = data_key[2]
                         end = data_key[3]
                         if attr in underscore:
                             user_keys.add(attr)
-                            value = self.user_data[data_key]
                             if not srsly.is_json_serializable(value):
                                 raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                             # Check if doc attribute
                             if start is None:
+                                if "_" not in data:
+                                    data["_"] = {}
                                 data["_"][attr] = value
                             # Check if token attribute
                             elif end is None:
+                                if "underscore_token" not in data:
+                                    data["underscore_token"] = {}
                                 if attr not in data["underscore_token"]:
-                                    data["underscore_token"][attr] = {"token_start": start, "value": value}
+                                    data["underscore_token"][attr] = []
+                                data["underscore_token"][attr].append({"start": start, "value": value})
                             # Else span attribute
                             else:
+                                if "underscore_span" not in data:
+                                    data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
-                                    data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
+                                    data["underscore_span"][attr] = []
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
 
             for attr in underscore:
                 if attr not in user_keys:

From 3d0e895363921d4acb7f89a5b708472681e6fc1b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 19 Oct 2022 17:33:55 +0200
Subject: [PATCH 12/12] Set version to v3.4.2 (#11672)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 843c15aba..ce86e6294 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.1"
+__version__ = "3.4.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"