Merge pull request #11686 from adrianeboyd/chore/update-v4-from-master

Update v4 from master
Adriane Boyd 2022-10-21 12:55:53 +02:00 committed by GitHub
commit a4bd890f32
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 228 additions and 99 deletions

View File

@ -10,6 +10,7 @@ steps:
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"

View File

@ -6,7 +6,7 @@ repos:
language_version: python3.7
additional_dependencies: ['click==8.0.4']
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
rev: 5.0.4
hooks:
- id: flake8
args:

View File

@ -85,6 +85,15 @@ jobs:
Python310Mac:
imageName: "macos-latest"
python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11.0-rc.2'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11.0-rc.2'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11.0-rc.2'
maxParallel: 4
pool:
vmImage: $(imageName)

View File

@ -15,7 +15,7 @@ pathy>=0.3.5
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
@ -28,7 +28,7 @@ cython>=0.25,<3.0
pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"

View File

@ -48,7 +48,7 @@ install_requires =
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
# Official Python utilities
setuptools

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.4.1"
__version__ = "3.4.2"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -25,6 +25,7 @@ def project_update_dvc_cli(
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
@ -36,7 +37,7 @@ def project_update_dvc_cli(
DOCS: https://spacy.io/api/cli#project-dvc
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
@ -44,6 +45,7 @@ def project_update_dvc(
workflow: Optional[str] = None,
*,
verbose: bool = False,
quiet: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
@ -54,11 +56,12 @@ def project_update_dvc(
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
quiet (bool): Print less info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
@ -72,7 +75,7 @@ def update_dvc_config(
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
quiet: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
@ -83,7 +86,7 @@ def update_dvc_config(
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
quiet (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
@ -105,6 +108,14 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
# some flags that apply to every command
flags = []
if verbose:
flags.append("--verbose")
if quiet:
flags.append("--quiet")
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
@ -118,14 +129,26 @@ def update_dvc_config(
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
if not dvc_commands:
# If we don't check for this, then there will be an error when reading the
# config, since DVC wouldn't create it.
msg.fail(
"No usable commands for DVC found. This can happen if none of your "
"commands have dependencies or outputs.",
exits=1,
)
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, flags=dvc_flags)
for c in dvc_commands:
dvc_command = "dvc " + c
run_command(dvc_command)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
@ -133,26 +156,6 @@ def update_dvc_config(
return True
def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for c in commands:
command = split_command(c)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
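
A minimal sketch of exercising the new `--quiet` plumbing from Python, assuming the helper stays importable from `spacy.cli.project.dvc` (the project path here is illustrative):

```python
from pathlib import Path

# Import path assumed from the file changed in this commit.
from spacy.cli.project.dvc import project_update_dvc

# Regenerate dvc.yaml for the first workflow; quiet=True is forwarded
# as --quiet to every generated "dvc run" call via the new flags list.
project_update_dvc(Path("."), workflow=None, verbose=False, quiet=True, force=False)
```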

View File

@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:

View File

@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
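
For reference, a small sketch of selecting the newly supported `pymorphy2_lookup` mode, mirroring the test fixtures added below (requires the `pymorphy2` package, plus `pymorphy2-dicts-uk` for Ukrainian; the example words are taken from the new Russian test):

```python
import spacy
from spacy.tokens import Doc

# Lookup-only pymorphy2 lemmatization for Russian; use spacy.blank("uk") for Ukrainian.
nlp = spacy.blank("ru")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
doc = Doc(nlp.vocab, words=["мама", "мыла", "раму"])
doc = lemmatizer(doc)
print([t.lemma_ for t in doc])
```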

View File

@ -1,7 +1,6 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
from typing import Sequence, Tuple, Union
from typing import cast, Any, Callable, Dict, Iterable, List, Optional, Union
from typing import Tuple
from collections import Counter
from copy import deepcopy
from itertools import islice
import numpy as np

View File

@ -30,17 +30,17 @@ scorer = {"@layers": "spacy.LinearLogistic.v1"}
hidden_size = 128
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@ -139,6 +139,9 @@ def make_spancat(
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.
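
A hedged usage sketch of the component this default config feeds; the `spans_key` value and label are illustrative, not taken from this diff:

```python
import spacy

# Adding the span categorizer picks up the updated default model config
# (Tok2Vec.v2 / MultiHashEmbed.v2 / MaxoutWindowEncoder.v2).
nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat", config={"spans_key": "sc"})
spancat.add_label("PERSON")
```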

View File

@ -19,7 +19,7 @@ multi_label_default_config = """
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
@ -29,7 +29,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@ -98,7 +98,7 @@ def make_multilabel_textcat(
threshold: float,
scorer: Optional[Callable],
save_activations: bool,
) -> "TextCategorizer":
) -> "MultiLabel_TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
@ -107,6 +107,7 @@ def make_multilabel_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
"""
return MultiLabel_TextCategorizer(
nlp.vocab,
@ -155,7 +156,11 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/textcategorizer#init
"""

View File

@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")
class Config:
extra = "forbid"
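
These fields back the numeric comparison operators in token match patterns; a minimal sketch with illustrative pattern values:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# {"LENGTH": {">=": 10}} is validated by the GEQ field above.
matcher.add("LONG_WORD", [[{"LENGTH": {">=": 10}}]])
doc = nlp("an unquestionably short sentence")
print([doc[start:end].text for _, start, end in matcher(doc)])
```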
@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on
@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on
@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
None, title="Indices of sentences' start and end indices"
)
text: StrictStr = Field(..., title="Document text")
spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
None, title="Span information - end/start indices, label, KB ID"
)
spans: Optional[
Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
] = Field(None, title="Span information - end/start indices, label, KB ID")
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
..., title="Token information - ID, start, annotations"
)
@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
title="Any custom data stored in the document's _ attribute",
alias="_",
)
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the token's _ attribute"
)
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the span's _ attribute"
)
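
The list-valued shape these two fields now accept looks roughly like this; the keys mirror the updated `Doc.to_json` output in this commit, while the concrete values are illustrative:

```python
# Sketch of the per-attribute lists the schema now validates.
doc_json_underscore = {
    "underscore_token": {
        "token_test": [
            {"start": 0, "value": 117},
            {"start": 6, "value": 118},
        ]
    },
    "underscore_span": {
        "span_test": [
            {"start": 0, "end": 5, "value": "span_attribute"},
        ]
    },
}
```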

View File

@ -357,6 +357,14 @@ def ru_lemmatizer():
return get_lang_class("ru")().add_pipe("lemmatizer")
@pytest.fixture
def ru_lookup_lemmatizer():
pytest.importorskip("pymorphy2")
return get_lang_class("ru")().add_pipe(
"lemmatizer", config={"mode": "pymorphy2_lookup"}
)
@pytest.fixture(scope="session")
def sa_tokenizer():
return get_lang_class("sa")().tokenizer
@ -436,6 +444,15 @@ def uk_lemmatizer():
return get_lang_class("uk")().add_pipe("lemmatizer")
@pytest.fixture
def uk_lookup_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2_dicts_uk")
return get_lang_class("uk")().add_pipe(
"lemmatizer", config={"mode": "pymorphy2_lookup"}
)
@pytest.fixture(scope="session")
def ur_tokenizer():
return get_lang_class("ur")().tokenizer

View File

@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118
doc.spans["span_group"] = [doc[0:1]]
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
assert json_doc["_"]["json_test2"] == [1, 2, 3]
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
assert json_doc["_"]["json_test"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
assert json_doc["_"]["my_ext"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
doc[0]._.token_test = 117
json_doc = doc.to_json(underscore=["span_test"])
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert "token_test" not in json_doc["underscore_token"]
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert "underscore_token" not in json_doc
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
assert new_doc._.json_test1 == "hello world"
assert new_doc._.json_test2 == [1, 2, 3]
assert new_doc[0]._.token_test == 117
assert new_doc[1]._.token_test == 118
assert new_doc[0:1]._.span_test == "span_attribute"
assert new_doc[0:2]._.span_test == "span_attribute_2"
assert new_doc.user_data == doc.user_data
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
exclude=["user_data"]

View File

@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
words = ["мама", "мыла", "раму"]
pos = ["NOUN", "VERB", "NOUN"]
morphs = [
"Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
"Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
"Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
]
doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
doc = ru_lookup_lemmatizer(doc)
lemmas = [token.lemma_ for token in doc]
assert lemmas == ["мама", "мыла", "раму"]

View File

@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
"""Check that the default uk lemmatizer runs."""
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
uk_lemmatizer(doc)
assert [token.lemma for token in doc]
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
"""Check that the lookup uk lemmatizer runs."""
doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
uk_lookup_lemmatizer(doc)
assert [token.lemma for token in doc]

View File

@ -1619,24 +1619,20 @@ cdef class Doc:
Doc.set_extension(attr)
self._.set(attr, doc_json["_"][attr])
if doc_json.get("underscore_token", {}):
for token_attr in doc_json["underscore_token"]:
token_start = doc_json["underscore_token"][token_attr]["token_start"]
value = doc_json["underscore_token"][token_attr]["value"]
if not Token.has_extension(token_attr):
Token.set_extension(token_attr)
self[token_start]._.set(token_attr, value)
for token_attr in doc_json.get("underscore_token", {}):
if not Token.has_extension(token_attr):
Token.set_extension(token_attr)
for token_data in doc_json["underscore_token"][token_attr]:
start = token_by_char(self.c, self.length, token_data["start"])
value = token_data["value"]
self[start]._.set(token_attr, value)
if doc_json.get("underscore_span", {}):
for span_attr in doc_json["underscore_span"]:
token_start = doc_json["underscore_span"][span_attr]["token_start"]
token_end = doc_json["underscore_span"][span_attr]["token_end"]
value = doc_json["underscore_span"][span_attr]["value"]
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
self[token_start:token_end]._.set(span_attr, value)
for span_attr in doc_json.get("underscore_span", {}):
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
for span_data in doc_json["underscore_span"][span_attr]:
value = span_data["value"]
self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
return self
def to_json(self, underscore=None):
@ -1684,30 +1680,34 @@ cdef class Doc:
if underscore:
user_keys = set()
if self.user_data:
data["_"] = {}
data["underscore_token"] = {}
data["underscore_span"] = {}
for data_key in self.user_data:
for data_key, value in self.user_data.copy().items():
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
attr = data_key[1]
start = data_key[2]
end = data_key[3]
if attr in underscore:
user_keys.add(attr)
value = self.user_data[data_key]
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
# Check if doc attribute
if start is None:
if "_" not in data:
data["_"] = {}
data["_"][attr] = value
# Check if token attribute
elif end is None:
if "underscore_token" not in data:
data["underscore_token"] = {}
if attr not in data["underscore_token"]:
data["underscore_token"][attr] = {"token_start": start, "value": value}
data["underscore_token"][attr] = []
data["underscore_token"][attr].append({"start": start, "value": value})
# Else span attribute
else:
if "underscore_span" not in data:
data["underscore_span"] = {}
if attr not in data["underscore_span"]:
data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
data["underscore_span"][attr] = []
data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
for attr in underscore:
if attr not in user_keys:

View File

@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
</Infobox>
```cli
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
```
> #### Example
@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
| `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |

View File

@ -21,9 +21,9 @@ Create the knowledge base.
> #### Example
>
> ```python
> from spacy.kb import KnowledgeBase
> from spacy.kb import InMemoryLookupKB
> vocab = nlp.vocab
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
> ```
| Name | Description |

View File

@ -243,6 +243,27 @@ pipelines.
> python -m spacy project run test . --vars.foo bar
> ```
> #### Tip: Environment Variables
>
> Commands in a project file are not executed in a shell, so they don't have
> direct access to environment variables. But you can insert environment
> variables using the `env` dictionary to make values available for
> interpolation, just like values in `vars`. Here's an example `env` dict that
> makes `$PATH` available as `ENV_PATH`:
>
> ```yaml
> env:
> ENV_PATH: PATH
> ```
>
> This can be used in a project command like so:
>
> ```yaml
> - name: "echo-path"
> script:
> - "echo ${env.ENV_PATH}"
> ```
| Section | Description |
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |

View File

@ -1,5 +1,46 @@
{
"resources": [
{
"id": "spacy-cleaner",
"title": "spacy-cleaner",
"slogan": "Easily clean text with spaCy!",
"description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
"github": "Ce11an/spacy-cleaner",
"pip": "spacy-cleaner",
"code_example": [
"import spacy",
"import spacy_cleaner",
"from spacy_cleaner.processing import removers, replacers, mutators",
"",
"model = spacy.load(\"en_core_web_sm\")",
"pipeline = spacy_cleaner.Pipeline(",
" model,",
" removers.remove_stopword_token,",
" replacers.replace_punctuation_token,",
" mutators.mutate_lemma_token,",
")",
"",
"texts = [\"Hello, my name is Cellan! I love to swim!\"]",
"",
"pipeline.clean(texts)",
"# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
],
"code_language": "python",
"url": "https://ce11an.github.io/spacy-cleaner/",
"image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
"author": "Cellan Hall",
"author_links": {
"twitter": "Ce11an",
"github": "Ce11an",
"website": "https://www.linkedin.com/in/cellan-hall/"
},
"category": [
"extension"
],
"tags": [
"text-processing"
]
},
{
"id": "Zshot",
"title": "Zshot",
@ -2460,20 +2501,20 @@
"import spacy",
"from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
"",
"# Load an spacy model (supported models are \"es\" and \"en\") ",
"nlp = spacy.load('en')",
"# Spacy 3.x",
"nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
"# Spacy 2.x",
"# Load a spaCy model (supported languages are \"es\" and \"en\") ",
"nlp = spacy.load('en_core_web_sm')",
"# spaCy 3.x",
"nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
"# spaCy 2.x",
"# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
"token = nlp('prices')[0]",
"",
"# wordnet object link spacy token with nltk wordnet interface by giving acces to",
"# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
"# synsets and lemmas ",
"token._.wordnet.synsets()",
"token._.wordnet.lemmas()",
"",
"# And automatically tags with wordnet domains",
"# And automatically add info about WordNet domains",
"token._.wordnet.wordnet_domains()"
],
"author": "recognai",