Mirror of https://github.com/explosion/spaCy.git

Commit 683f470852: Merge branch 'master' into feature/coref
.github/ISSUE_TEMPLATE/01_bugs.md (2 changes)

@@ -4,6 +4,8 @@ about: Use this template if you came across a bug or unexpected behaviour differ
 
 ---
 
+<!-- NOTE: For questions or install related issues, please open a Discussion instead. -->
+
 ## How to reproduce the behaviour
 <!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->
 
.github/ISSUE_TEMPLATE/config.yml (3 changes)

@@ -1,8 +1,5 @@
 blank_issues_enabled: false
 contact_links:
-  - name: ⚠️ Python 3.10 Support
-    url: https://github.com/explosion/spaCy/discussions/9418
-    about: Python 3.10 wheels haven't been released yet, see the link for details.
  - name: 🗯 Discussions Forum
    url: https://github.com/explosion/spaCy/discussions
    about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
.github/azure-steps.yml (34 changes)

@@ -64,12 +64,12 @@ steps:
    displayName: "Run GPU tests"
    condition: eq(${{ parameters.gpu }}, true)
 
-  - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
-      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+  # - script: |
+  #     python -m spacy download ca_core_news_sm
+  #     python -m spacy download ca_core_news_md
+  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+  #   displayName: 'Test download CLI'
+  #   condition: eq(variables['python_version'], '3.8')
 
  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .

@@ -93,17 +93,17 @@ steps:
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')
 
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+  # - script: |
+  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+  #   displayName: 'Test assemble CLI'
+  #   condition: eq(variables['python_version'], '3.8')
+  #
+  # - script: |
+  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+  #   displayName: 'Test assemble CLI vectors warning'
+  #   condition: eq(variables['python_version'], '3.8')
 
  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
.github/contributors/fonfonx.md (new file, 106 lines)

@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry           |
+| ------------------------------ | --------------- |
+| Name                           | Xavier Fontaine |
+| Company name (if applicable)   |                 |
+| Title or role (if applicable)  |                 |
+| Date                           | 2022-04-13      |
+| GitHub username                | fonfonx         |
+| Website (optional)             |                 |
.github/workflows/gputests.yml (new file, 21 lines)

@@ -0,0 +1,21 @@
+name: Weekly GPU tests
+
+on:
+  schedule:
+    - cron: '0 1 * * MON'
+
+jobs:
+  weekly-gputests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger buildkite build
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-gpu-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
.github/workflows/slowtests.yml (new file, 37 lines)

@@ -0,0 +1,37 @@
+name: Daily slow tests
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  daily-slowtests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+        with:
+          ref: ${{ matrix.branch }}
+      - name: Get commits from past 24 hours
+        id: check_commits
+        run: |
+          today=$(date '+%Y-%m-%d %H:%M:%S')
+          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
+          if git log --after="$yesterday" --before="$today" | grep commit ; then
+            echo "::set-output name=run_tests::true"
+          else
+            echo "::set-output name=run_tests::false"
+          fi
+
+      - name: Trigger buildkite build
+        if: steps.check_commits.outputs.run_tests == 'true'
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
.gitignore (1 change)

@@ -9,7 +9,6 @@ keys/
 spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt
-spacy/tests/universe/universe.json
 
 # Website
 website/.cache/
CONTRIBUTING.md

@@ -144,7 +144,7 @@ Changes to `.py` files will be effective immediately.
 
 When fixing a bug, first create an
 [issue](https://github.com/explosion/spaCy/issues) if one does not already
 exist. The description text can be very short – we don't want to make this too
 bureaucratic.
 
 Next, add a test to the relevant file in the

@@ -233,7 +233,7 @@ also want to keep an eye on unused declared variables or repeated
 (i.e. overwritten) dictionary keys. If your code was formatted with `black`
 (see above), you shouldn't see any formatting-related warnings.
 
-The [`.flake8`](.flake8) config defines the configuration we use for this
+The `flake8` section in [`setup.cfg`](setup.cfg) defines the configuration we use for this
 codebase. For example, we're not super strict about the line length, and we're
 excluding very large files like lemmatization and tokenizer exception tables.
 
README.md (31 changes)

@@ -32,19 +32,20 @@ open-source software, released under the MIT license.
 
 ## 📖 Documentation
 
 | Documentation |  |
-| -------------------------- | -------------------------------------------------------------- |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
+| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -60,9 +61,7 @@ open-source software, released under the MIT license.
 
 ## 💬 Where to ask questions
 
-The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
-**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
 Please understand that we won't be able to provide individual support via email.
 We also believe that help is much more valuable if it's shared publicly, so that
 more people can benefit from it.
azure-pipelines.yml

@@ -11,12 +11,14 @@ trigger:
    exclude:
      - "website/*"
      - "*.md"
+      - ".github/workflows/*"
 pr:
   paths:
     exclude:
      - "*.md"
      - "website/docs/*"
      - "website/src/*"
+      - ".github/workflows/*"
 
 jobs:
   # Perform basic checks for most important errors (syntax etc.) Uses the config
@@ -137,7 +137,7 @@ If any of the TODOs you've added are important and should be fixed soon, you sho
 
 ## Type hints
 
-We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
+We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues.
 
 If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values.
 

@@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
     return callback
 ```
 
+For typing variables, we prefer the explicit format.
+
+```diff
+- var = value  # type: Type
++ var: Type = value
+```
+
 For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).
 
 ```python
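For illustration only (not part of this commit), a minimal sketch of the two conventions described above: `Callable[...]` taking the argument types as a list followed by the return type, and the explicit `var: Type = value` annotation format.

```python
from typing import Callable, List


def truncate(text: str, n: int) -> List[str]:
    """Return at most the first n whitespace-separated tokens."""
    return text.split()[:n]


# Explicit variable annotation: the argument types go in a list, then the return type.
callback: Callable[[str, int], List[str]] = truncate
print(callback("spaCy type hints example", 2))  # ['spaCy', 'type']
```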
pyproject.toml

@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.12,<8.1.0",
+    "thinc>=8.0.14,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",
requirements.txt

@@ -1,14 +1,14 @@
 # Our libraries
-spacy-legacy>=3.0.8,<3.1.0
+spacy-legacy>=3.0.9,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.12,<8.1.0
+thinc>=8.0.14,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.8.1,<1.1.0
-srsly>=2.4.1,<3.0.0
+wasabi>=0.9.1,<1.1.0
+srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
 pathy>=0.3.5

@@ -26,7 +26,7 @@ typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
-pytest>=5.2.0
+pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0

@@ -35,3 +35,4 @@ mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-requests
+black>=22.0,<23.0
setup.cfg (10 changes)

@@ -38,18 +38,18 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.12,<8.1.0
+    thinc>=8.0.14,<8.1.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.8,<3.1.0
+    spacy-legacy>=3.0.9,<3.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.12,<8.1.0
+    thinc>=8.0.14,<8.1.0
     blis>=0.4.0,<0.8.0
-    wasabi>=0.8.1,<1.1.0
-    srsly>=2.4.1,<3.0.0
+    wasabi>=0.9.1,<1.1.0
+    srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     typer>=0.3.0,<0.5.0
     pathy>=0.3.5
setup.py (3 changes)

@@ -23,6 +23,7 @@ Options.docstrings = True
 
 PACKAGES = find_packages()
 MOD_NAMES = [
+    "spacy.training.alignment_array",
     "spacy.training.example",
     "spacy.parts_of_speech",
     "spacy.strings",

@@ -33,6 +34,7 @@ MOD_NAMES = [
     "spacy.ml.parser_model",
     "spacy.morphology",
     "spacy.pipeline.dep_parser",
+    "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
     "spacy.pipeline.multitask",
     "spacy.pipeline.ner",

@@ -81,7 +83,6 @@ COPY_FILES = {
     ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
     ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
     ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
-    ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
 }
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.2.1"
+__version__ = "3.3.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/cli/__init__.py

@@ -14,6 +14,7 @@ from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
@@ -360,7 +360,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     src = str(src)
     with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
         with dest.open(mode="wb") as output_file:
-            output_file.write(input_file.read())
+            shutil.copyfileobj(input_file, output_file)
 
 
 def ensure_pathy(path):
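For illustration only (not part of this commit): the one-line change above switches from reading the whole download into memory to a chunked copy. A minimal standalone sketch of the same pattern, with placeholder file names:

```python
import shutil

# shutil.copyfileobj streams the source to the destination in chunks,
# so large downloads no longer have to fit in memory all at once.
with open("source.bin", "rb") as input_file:
    with open("dest.bin", "wb") as output_file:
        shutil.copyfileobj(input_file, output_file)
```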
spacy/cli/debug_data.py

@@ -19,6 +19,7 @@ from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
 from ..compat import Literal
+from ..vectors import Mode as VectorsMode
 from .. import util
 
 

@@ -170,29 +171,101 @@ def debug_data(
         show=verbose,
     )
     if len(nlp.vocab.vectors):
-        msg.info(
-            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
-            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
-        )
-        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
-        msg.warn(
-            "{} words in training data without vectors ({:.0f}%)".format(
-                n_missing_vectors,
-                100 * (n_missing_vectors / gold_train_data["n_words"]),
-            ),
-        )
-        msg.text(
-            "10 most common words without vectors: {}".format(
-                _format_labels(
-                    gold_train_data["words_missing_vectors"].most_common(10),
-                    counts=True,
-                )
-            ),
-            show=verbose,
-        )
+        if nlp.vocab.vectors.mode == VectorsMode.floret:
+            msg.info(
+                f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
+                f"{nlp.vocab.vectors_length} dimensions, "
+                f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
+                f"n-gram subwords"
+            )
+        else:
+            msg.info(
+                f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
+                f"unique keys, {nlp.vocab.vectors_length} dimensions)"
+            )
+            n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+            msg.warn(
+                "{} words in training data without vectors ({:.0f}%)".format(
+                    n_missing_vectors,
+                    100 * (n_missing_vectors / gold_train_data["n_words"]),
+                ),
+            )
+            msg.text(
+                "10 most common words without vectors: {}".format(
+                    _format_labels(
+                        gold_train_data["words_missing_vectors"].most_common(10),
+                        counts=True,
+                    )
+                ),
+                show=verbose,
+            )
     else:
         msg.info("No word vectors present in the package")
 
+    if "spancat" in factory_names:
+        model_labels_spancat = _get_labels_from_spancat(nlp)
+        has_low_data_warning = False
+        has_no_neg_warning = False
+
+        msg.divider("Span Categorization")
+        msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)
+
+        msg.text("Label counts in train data: ", show=verbose)
+        for spans_key, data_labels in gold_train_data["spancat"].items():
+            msg.text(
+                f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
+                show=verbose,
+            )
+        # Data checks: only take the spans keys in the actual spancat components
+        data_labels_in_component = {
+            spans_key: gold_train_data["spancat"][spans_key]
+            for spans_key in model_labels_spancat.keys()
+        }
+        for spans_key, data_labels in data_labels_in_component.items():
+            for label, count in data_labels.items():
+                # Check for missing labels
+                spans_key_in_model = spans_key in model_labels_spancat.keys()
+                if (spans_key_in_model) and (
+                    label not in model_labels_spancat[spans_key]
+                ):
+                    msg.warn(
+                        f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
+                        "Performance may degrade after training."
+                    )
+                # Check for low number of examples per label
+                if count <= NEW_LABEL_THRESHOLD:
+                    msg.warn(
+                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
+                    )
+                    has_low_data_warning = True
+                # Check for negative examples
+                with msg.loading("Analyzing label distribution..."):
+                    neg_docs = _get_examples_without_label(
+                        train_dataset, label, "spancat", spans_key
+                    )
+                if neg_docs == 0:
+                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
+                    has_no_neg_warning = True
+
+        if has_low_data_warning:
+            msg.text(
+                f"To train a new span type, your data should include at "
+                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
+                show=verbose,
+            )
+        else:
+            msg.good("Good amount of examples for all labels")
+
+        if has_no_neg_warning:
+            msg.text(
+                "Training data should always include examples of spans "
+                "in context, as well as examples without a given span "
+                "type.",
+                show=verbose,
+            )
+        else:
+            msg.good("Examples without ocurrences available for all labels")
+
     if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(

@@ -238,7 +311,7 @@ def debug_data(
                 has_low_data_warning = True
 
             with msg.loading("Analyzing label distribution..."):
-                neg_docs = _get_examples_without_label(train_dataset, label)
+                neg_docs = _get_examples_without_label(train_dataset, label, "ner")
             if neg_docs == 0:
                 msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                 has_no_neg_warning = True

@@ -573,6 +646,7 @@ def _compile_gold(
         "deps": Counter(),
         "words": Counter(),
         "roots": Counter(),
+        "spancat": dict(),
         "ws_ents": 0,
         "boundary_cross_ents": 0,
         "n_words": 0,

@@ -603,6 +677,7 @@ def _compile_gold(
             if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                 data["words_missing_vectors"].update([word])
         if "ner" in factory_names:
+            sent_starts = eg.get_aligned_sent_starts()
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue

@@ -612,10 +687,19 @@ def _compile_gold(
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
-                if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
+        if "spancat" in factory_names:
+            for span_key in list(eg.reference.spans.keys()):
+                if span_key not in data["spancat"]:
+                    data["spancat"][span_key] = Counter()
+                for i, span in enumerate(eg.reference.spans[span_key]):
+                    if span.label_ is None:
+                        continue
+                    else:
+                        data["spancat"][span_key][span.label_] += 1
         if "textcat" in factory_names or "textcat_multilabel" in factory_names:
             data["cats"].update(gold.cats)
             if any(val not in (0, 1) for val in gold.cats.values()):

@@ -686,14 +770,28 @@ def _format_labels(
     return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
 
 
-def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
+def _get_examples_without_label(
+    data: Sequence[Example],
+    label: str,
+    component: Literal["ner", "spancat"] = "ner",
+    spans_key: Optional[str] = "sc",
+) -> int:
     count = 0
     for eg in data:
-        labels = [
-            label.split("-")[1]
-            for label in eg.get_aligned_ner()
-            if label not in ("O", "-", None)
-        ]
+        if component == "ner":
+            labels = [
+                label.split("-")[1]
+                for label in eg.get_aligned_ner()
+                if label not in ("O", "-", None)
+            ]
+
+        if component == "spancat":
+            labels = (
+                [span.label_ for span in eg.reference.spans[spans_key]]
+                if spans_key in eg.reference.spans
+                else []
+            )
+
         if label not in labels:
             count += 1
     return count
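For illustration only (not part of this commit), a minimal standalone sketch of the per-key label counting that the new spancat branch in `_compile_gold` performs, with plain tuples standing in for spaCy `Span` objects:

```python
from collections import Counter

# Hypothetical gold spans grouped by spans key, as (label, start, end) tuples.
reference_spans = {"sc": [("PERSON", 0, 2), ("ORG", 5, 7), ("PERSON", 9, 11)]}

spancat_counts = {}
for span_key, spans in reference_spans.items():
    counts = spancat_counts.setdefault(span_key, Counter())
    for label, _start, _end in spans:
        counts[label] += 1

print(spancat_counts)  # {'sc': Counter({'PERSON': 2, 'ORG': 1})}
```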
spacy/cli/debug_diff.py (new file, 89 lines)

@@ -0,0 +1,89 @@
+from typing import Optional
+
+import typer
+from wasabi import Printer, diff_strings, MarkdownRenderer
+from pathlib import Path
+from thinc.api import Config
+
+from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
+from ..util import load_config
+from .init_config import init_config, Optimizations
+
+
+@debug_cli.command(
+    "diff-config",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def debug_diff_cli(
+    # fmt: off
+    ctx: typer.Context,
+    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
+    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
+    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
+    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
+    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
+    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
+    # fmt: on
+):
+    """Show a diff of a config file with respect to spaCy's defaults or another config file. If
+    additional settings were used in the creation of the config file, then you
+    must supply these as extra parameters to the command when comparing to the default settings. The generated diff
+    can also be used when posting to the discussion forum to provide more
+    information for the maintainers.
+
+    The `optimize`, `gpu`, and `pretraining` options are only relevant when
+    comparing against the default configuration (or specifically when `compare_to` is None).
+
+    DOCS: https://spacy.io/api/cli#debug-diff
+    """
+    debug_diff(
+        config_path=config_path,
+        compare_to=compare_to,
+        gpu=gpu,
+        optimize=optimize,
+        pretraining=pretraining,
+        markdown=markdown,
+    )
+
+
+def debug_diff(
+    config_path: Path,
+    compare_to: Optional[Path],
+    gpu: bool,
+    optimize: Optimizations,
+    pretraining: bool,
+    markdown: bool,
+):
+    msg = Printer()
+    with show_validation_error(hint_fill=False):
+        user_config = load_config(config_path)
+        if compare_to:
+            other_config = load_config(compare_to)
+        else:
+            # Recreate a default config based from user's config
+            lang = user_config["nlp"]["lang"]
+            pipeline = list(user_config["nlp"]["pipeline"])
+            msg.info(f"Found user-defined language: '{lang}'")
+            msg.info(f"Found user-defined pipelines: {pipeline}")
+            other_config = init_config(
+                lang=lang,
+                pipeline=pipeline,
+                optimize=optimize.value,
+                gpu=gpu,
+                pretraining=pretraining,
+                silent=True,
+            )
+
+    user = user_config.to_str()
+    other = other_config.to_str()
+
+    if user == other:
+        msg.warn("No diff to show: configs are identical")
+    else:
+        diff_text = diff_strings(other, user, add_symbols=markdown)
+        if markdown:
+            md = MarkdownRenderer()
+            md.add(md.code_block(diff_text, "diff"))
+            print(md.text)
+        else:
+            print(diff_text)
spacy/cli/package.py

@@ -7,6 +7,7 @@ from collections import defaultdict
 from catalogue import RegistryError
 import srsly
 import sys
+import re
 
 from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
 from ..schemas import validate, ModelMetaSchema

@@ -109,6 +110,24 @@ def package(
             ", ".join(meta["requirements"]),
         )
     if name is not None:
+        if not name.isidentifier():
+            msg.fail(
+                f"Model name ('{name}') is not a valid module name. "
+                "This is required so it can be imported as a module.",
+                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
+                "and 0-9. "
+                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
+                exits=1,
+            )
+        if not _is_permitted_package_name(name):
+            msg.fail(
+                f"Model name ('{name}') is not a permitted package name. "
+                "This is required to correctly load the model with spacy.load.",
+                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
+                "and 0-9. "
+                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
+                exits=1,
+            )
         meta["name"] = name
     if version is not None:
         meta["version"] = version

@@ -162,7 +181,7 @@ def package(
         imports="\n".join(f"from . import {m}" for m in imports)
     )
     create_file(package_path / "__init__.py", init_py)
-    msg.good(f"Successfully created package '{model_name_v}'", main_path)
+    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
             util.run_command([sys.executable, "setup.py", "sdist"], capture=False)

@@ -171,8 +190,14 @@ def package(
     if create_wheel:
         with util.working_dir(main_path):
             util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
-        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
+        wheel_name_squashed = re.sub("_+", "_", model_name_v)
+        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)
+    if "__" in model_name:
+        msg.warn(
+            f"Model name ('{model_name}') contains a run of underscores. "
+            "Runs of underscores are not significant in installed package names.",
+        )
 
 
 def has_wheel() -> bool:

@@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
     return md.text
 
 
+def _is_permitted_package_name(package_name: str) -> bool:
+    # regex from: https://www.python.org/dev/peps/pep-0426/#name
+    permitted_match = re.search(
+        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
+    )
+    return permitted_match is not None
+
+
 TEMPLATE_SETUP = """
 #!/usr/bin/env python
 import io
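For illustration only (not part of this commit): to see what the new package-name check accepts, here is a small standalone sketch that reuses the same PEP 426 regex. The sample names are made up for the example.

```python
import re


def is_permitted_package_name(package_name: str) -> bool:
    # Same regex as the new _is_permitted_package_name helper above (PEP 426 name rule).
    return (
        re.search(
            r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
        )
        is not None
    )


for name in ["en_core_web_sm", "my-model", "_private", "trailing_"]:
    print(name, is_permitted_package_name(name))
# en_core_web_sm True, my-model True, _private False, trailing_ False
```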
@ -3,9 +3,15 @@ the docs and the init config command. It encodes various best practices and
|
||||||
can help generate the best possible configuration, given a user's requirements. #}
|
can help generate the best possible configuration, given a user's requirements. #}
|
||||||
{%- set use_transformer = hardware != "cpu" -%}
|
{%- set use_transformer = hardware != "cpu" -%}
|
||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
|
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||||
[paths]
|
[paths]
|
||||||
train = null
|
train = null
|
||||||
dev = null
|
dev = null
|
||||||
|
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||||
|
vectors = null
|
||||||
|
{% else -%}
|
||||||
|
vectors = "{{ word_vectors }}"
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
[system]
|
[system]
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
|
@ -19,10 +25,10 @@ lang = "{{ lang }}"
|
||||||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||||
{%- else -%}
|
{%- else -%}
|
||||||
{%- set full_pipeline = components %}
|
{%- set full_pipeline = components -%}
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
||||||
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
||||||
|
@ -49,7 +55,7 @@ stride = 96
|
||||||
factory = "morphologizer"
|
factory = "morphologizer"
|
||||||
|
|
||||||
[components.morphologizer.model]
|
[components.morphologizer.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.morphologizer.model.tok2vec]
|
[components.morphologizer.model.tok2vec]
|
||||||
|
@ -65,7 +71,7 @@ grad_factor = 1.0
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
|
||||||
[components.tagger.model]
|
[components.tagger.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
[components.tagger.model.tok2vec]
|
||||||
|
@ -118,6 +124,60 @@ grad_factor = 1.0
|
||||||
@layers = "reduce_mean.v1"
|
@layers = "reduce_mean.v1"
|
||||||
{% endif -%}
|
{% endif -%}
|
||||||
|
|
||||||
|
{% if "spancat" in components -%}
|
||||||
|
[components.spancat]
|
||||||
|
factory = "spancat"
|
||||||
|
max_positive = null
|
||||||
|
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||||
|
spans_key = "sc"
|
||||||
|
threshold = 0.5
|
||||||
|
|
||||||
|
[components.spancat.model]
|
||||||
|
@architectures = "spacy.SpanCategorizer.v1"
|
||||||
|
|
||||||
|
[components.spancat.model.reducer]
|
||||||
|
@layers = "spacy.mean_max_reducer.v1"
|
||||||
|
hidden_size = 128
|
||||||
|
|
||||||
|
[components.spancat.model.scorer]
|
||||||
|
@layers = "spacy.LinearLogistic.v1"
|
||||||
|
nO = null
|
||||||
|
nI = null
|
||||||
|
|
||||||
|
[components.spancat.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.spancat.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
|
||||||
|
[components.spancat.suggester]
|
||||||
|
@misc = "spacy.ngram_suggester.v1"
|
||||||
|
sizes = [1,2,3]
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
|
{% if "trainable_lemmatizer" in components -%}
|
||||||
|
[components.trainable_lemmatizer]
|
||||||
|
factory = "trainable_lemmatizer"
|
||||||
|
backoff = "orth"
|
||||||
|
min_tree_freq = 3
|
||||||
|
overwrite = false
|
||||||
|
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
|
||||||
|
top_k = 1
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model]
|
||||||
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
nO = null
|
||||||
|
normalize = false
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
{% if "entity_linker" in components -%}
|
{% if "entity_linker" in components -%}
|
||||||
[components.entity_linker]
|
[components.entity_linker]
|
||||||
factory = "entity_linker"
|
factory = "entity_linker"
|
||||||
|
@ -126,7 +186,7 @@ incl_context = true
|
||||||
incl_prior = true
|
incl_prior = true
|
||||||
|
|
||||||
[components.entity_linker.model]
|
[components.entity_linker.model]
|
||||||
@architectures = "spacy.EntityLinker.v1"
|
@architectures = "spacy.EntityLinker.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.entity_linker.model.tok2vec]
|
[components.entity_linker.model.tok2vec]
|
||||||
|
@ -233,7 +293,7 @@ maxout_pieces = 3
|
||||||
factory = "morphologizer"
|
factory = "morphologizer"
|
||||||
|
|
||||||
[components.morphologizer.model]
|
[components.morphologizer.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.morphologizer.model.tok2vec]
|
[components.morphologizer.model.tok2vec]
|
||||||
|
@ -246,7 +306,7 @@ width = ${components.tok2vec.model.encode.width}
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
|
||||||
[components.tagger.model]
|
[components.tagger.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
[components.tagger.model.tok2vec]
|
||||||
|
@ -290,6 +350,54 @@ nO = null
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
{% if "spancat" in components %}
|
||||||
|
[components.spancat]
|
||||||
|
factory = "spancat"
|
||||||
|
max_positive = null
|
||||||
|
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||||
|
spans_key = "sc"
|
||||||
|
threshold = 0.5
|
||||||
|
|
||||||
|
[components.spancat.model]
|
||||||
|
@architectures = "spacy.SpanCategorizer.v1"
|
||||||
|
|
||||||
|
[components.spancat.model.reducer]
|
||||||
|
@layers = "spacy.mean_max_reducer.v1"
|
||||||
|
hidden_size = 128
|
||||||
|
|
||||||
|
[components.spancat.model.scorer]
|
||||||
|
@layers = "spacy.LinearLogistic.v1"
|
||||||
|
nO = null
|
||||||
|
nI = null
|
||||||
|
|
||||||
|
[components.spancat.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
|
[components.spancat.suggester]
|
||||||
|
@misc = "spacy.ngram_suggester.v1"
|
||||||
|
sizes = [1,2,3]
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if "trainable_lemmatizer" in components -%}
|
||||||
|
[components.trainable_lemmatizer]
|
||||||
|
factory = "trainable_lemmatizer"
|
||||||
|
backoff = "orth"
|
||||||
|
min_tree_freq = 3
|
||||||
|
overwrite = false
|
||||||
|
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
|
||||||
|
top_k = 1
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model]
|
||||||
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
nO = null
|
||||||
|
normalize = false
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
{% if "entity_linker" in components -%}
|
{% if "entity_linker" in components -%}
|
||||||
[components.entity_linker]
|
[components.entity_linker]
|
||||||
factory = "entity_linker"
|
factory = "entity_linker"
|
||||||
|
@ -298,7 +406,7 @@ incl_context = true
|
||||||
incl_prior = true
|
incl_prior = true
|
||||||
|
|
||||||
[components.entity_linker.model]
|
[components.entity_linker.model]
|
||||||
@architectures = "spacy.EntityLinker.v1"
|
@architectures = "spacy.EntityLinker.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.entity_linker.model.tok2vec]
|
[components.entity_linker.model.tok2vec]
|
||||||
|
@ -364,7 +472,7 @@ no_output_layer = false
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
{% for pipe in components %}
|
{% for pipe in components %}
|
||||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
|
{% if pipe not in listener_components %}
|
||||||
{# Other components defined by the user: we just assume they're factories #}
|
{# Other components defined by the user: we just assume they're factories #}
|
||||||
[components.{{ pipe }}]
|
[components.{{ pipe }}]
|
||||||
factory = "{{ pipe }}"
|
factory = "{{ pipe }}"
|
||||||
|
@ -421,8 +529,4 @@ compound = 1.001
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
[initialize]
|
[initialize]
|
||||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
|
||||||
vectors = ${paths.vectors}
|
vectors = ${paths.vectors}
|
||||||
{% else -%}
|
|
||||||
vectors = "{{ word_vectors }}"
|
|
||||||
{% endif -%}
|
|
||||||
|
|
|
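Note: the hunks above bump the template's tagger-style models to spacy.Tagger.v2 / spacy.EntityLinker.v2 and add spancat and trainable_lemmatizer blocks. As a minimal sketch (not part of this diff, and assuming a config generated from this template was saved as the hypothetical file "config.cfg"), the resulting architectures can be inspected like this:

import spacy

config = spacy.util.load_config("config.cfg")  # hypothetical path
for name, section in config["components"].items():
    arch = section.get("model", {}).get("@architectures")
    if arch:
        print(name, arch)  # e.g. "tagger spacy.Tagger.v2"
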
@ -4,10 +4,10 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
from typing import Union, Iterable, Optional, Dict, Any, Callable
from typing import List, Union, Iterable, Optional, Dict, Any, Callable
import warnings

from .render import DependencyRenderer, EntityRenderer
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter

@ -44,6 +44,7 @@ def render(
    factories = {
        "dep": (DependencyRenderer, parse_deps),
        "ent": (EntityRenderer, parse_ents),
        "span": (SpanRenderer, parse_spans),
    }
    if style not in factories:
        raise ValueError(Errors.E087.format(style=style))

@ -203,6 +204,42 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    return {"text": doc.text, "ents": ents, "title": title, "settings": settings}


def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    """Generate spans in [{start: i, end: i, label: 'label'}] format.

    doc (Doc): Document to parse.
    options (Dict[str, any]): Span-specific visualisation options.
    RETURNS (dict): Generated span types keyed by text (original text) and spans.
    """
    kb_url_template = options.get("kb_url_template", None)
    spans_key = options.get("spans_key", "sc")
    spans = [
        {
            "start": span.start_char,
            "end": span.end_char,
            "start_token": span.start,
            "end_token": span.end,
            "label": span.label_,
            "kb_id": span.kb_id_ if span.kb_id_ else "",
            "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
        }
        for span in doc.spans[spans_key]
    ]
    tokens = [token.text for token in doc]

    if not spans:
        warnings.warn(Warnings.W117.format(spans_key=spans_key))
    title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
    settings = get_doc_settings(doc)
    return {
        "text": doc.text,
        "spans": spans,
        "title": title,
        "settings": settings,
        "tokens": tokens,
    }

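The parse_spans() helper above feeds the new "span" style registered in the factories dict. A minimal usage sketch (not part of this diff); the pipeline, text and span labels are made up for illustration:

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China")
# parse_spans reads doc.spans under the default spans_key "sc"
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
html = displacy.render(doc, style="span")  # dispatches to SpanRenderer + parse_spans
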
def set_render_wrapper(func: Callable[[str], str]) -> None:
    """Set an optional wrapper function that is called around the generated
    HTML markup on displacy.render. This can be used to allow integration into

@ -1,12 +1,15 @@
from typing import Dict, Any, List, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import itertools

from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from .templates import TPL_ENTS, TPL_KB_LINK
from ..util import minify_html, escape_html, registry
from ..errors import Errors
from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
from .templates import TPL_TITLE

DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"

@ -33,6 +36,168 @@ DEFAULT_LABEL_COLORS = {
}


class SpanRenderer:
    """Render Spans as SVGs."""

    style = "span"

    def __init__(self, options: Dict[str, Any] = {}) -> None:
        """Initialise span renderer

        options (dict): Visualiser-specific options (colors, spans)
        """
        # Set up the colors and overall look
        colors = dict(DEFAULT_LABEL_COLORS)
        user_colors = registry.displacy_colors.get_all()
        for user_color in user_colors.values():
            if callable(user_color):
                # Since this comes from the function registry, we want to make
                # sure we support functions that *return* a dict of colors
                user_color = user_color()
            if not isinstance(user_color, dict):
                raise ValueError(Errors.E925.format(obj=type(user_color)))
            colors.update(user_color)
        colors.update(options.get("colors", {}))
        self.default_color = DEFAULT_ENTITY_COLOR
        self.colors = {label.upper(): color for label, color in colors.items()}

        # Set up how the text and labels will be rendered
        self.direction = DEFAULT_DIR
        self.lang = DEFAULT_LANG
        self.top_offset = options.get("top_offset", 40)
        self.top_offset_step = options.get("top_offset_step", 17)

        # Set up which templates will be used
        template = options.get("template")
        if template:
            self.span_template = template["span"]
            self.span_slice_template = template["slice"]
            self.span_start_template = template["start"]
        else:
            if self.direction == "rtl":
                self.span_template = TPL_SPAN_RTL
                self.span_slice_template = TPL_SPAN_SLICE_RTL
                self.span_start_template = TPL_SPAN_START_RTL
            else:
                self.span_template = TPL_SPAN
                self.span_slice_template = TPL_SPAN_SLICE
                self.span_start_template = TPL_SPAN_START

    def render(
        self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False
    ) -> str:
        """Render complete markup.

        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (str): Rendered HTML markup.
        """
        rendered = []
        for i, p in enumerate(parsed):
            if i == 0:
                settings = p.get("settings", {})
                self.direction = settings.get("direction", DEFAULT_DIR)
                self.lang = settings.get("lang", DEFAULT_LANG)
            rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title")))

        if page:
            docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
            markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction)
        else:
            markup = "".join(rendered)
        if minify:
            return minify_html(markup)
        return markup

    def render_spans(
        self,
        tokens: List[str],
        spans: List[Dict[str, Any]],
        title: Optional[str],
    ) -> str:
        """Render span types in text.

        Spans are rendered per-token, this means that for each token, we check if it's part
        of a span slice (a member of a span type) or a span start (the starting token of a
        given span type).

        tokens (list): Individual tokens in the text
        spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
        title (str / None): Document title set in Doc.user_data['title'].
        """
        per_token_info = []
        for idx, token in enumerate(tokens):
            # Identify if a token belongs to a Span (and which) and if it's a
            # start token of said Span. We'll use this for the final HTML render
            token_markup: Dict[str, Any] = {}
            token_markup["text"] = token
            entities = []
            for span in spans:
                ent = {}
                if span["start_token"] <= idx < span["end_token"]:
                    ent["label"] = span["label"]
                    ent["is_start"] = True if idx == span["start_token"] else False
                    kb_id = span.get("kb_id", "")
                    kb_url = span.get("kb_url", "#")
                    ent["kb_link"] = (
                        TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
                    )
                    entities.append(ent)
            token_markup["entities"] = entities
            per_token_info.append(token_markup)

        markup = self._render_markup(per_token_info)
        markup = TPL_SPANS.format(content=markup, dir=self.direction)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
        return markup

    def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
        """Render the markup from per-token information"""
        markup = ""
        for token in per_token_info:
            entities = sorted(token["entities"], key=lambda d: d["label"])
            if entities:
                slices = self._get_span_slices(token["entities"])
                starts = self._get_span_starts(token["entities"])
                markup += self.span_template.format(
                    text=token["text"], span_slices=slices, span_starts=starts
                )
            else:
                markup += escape_html(token["text"] + " ")
        return markup

    def _get_span_slices(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span slices"""
        span_slices = []
        for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
            color = self.colors.get(entity["label"].upper(), self.default_color)
            span_slice = self.span_slice_template.format(
                bg=color, top_offset=self.top_offset + step
            )
            span_slices.append(span_slice)
        return "".join(span_slices)

    def _get_span_starts(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span start tokens"""
        span_starts = []
        for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
            color = self.colors.get(entity["label"].upper(), self.default_color)
            span_start = (
                self.span_start_template.format(
                    bg=color,
                    top_offset=self.top_offset + step,
                    label=entity["label"],
                    kb_link=entity["kb_link"],
                )
                if entity["is_start"]
                else ""
            )
            span_starts.append(span_start)
        return "".join(span_starts)

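A hedged sketch of the "template" option handled in __init__ above: the three templates are plain format strings, and their placeholder names have to match what render_spans(), _get_span_slices() and _get_span_starts() pass in ({text}/{span_slices}/{span_starts}, {bg}/{top_offset}, and {bg}/{top_offset}/{label}/{kb_link}). The pipeline, text and labels below are made up for illustration.

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("The Bank of England cut rates")
doc.spans["sc"] = [Span(doc, 0, 4, "ORG")]
custom = {
    "span": "<span>{text}{span_slices}{span_starts}</span>",
    "slice": "<span style='background:{bg};top:{top_offset}px'></span>",
    "start": "<span style='background:{bg};top:{top_offset}px'>{label}{kb_link}</span>",
}
html = displacy.render(doc, style="span", options={"template": custom})
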
class DependencyRenderer:
    """Render dependency parses as SVGs."""

@ -105,7 +270,7 @@ class DependencyRenderer:
        RETURNS (str): Rendered SVG markup.
        """
        self.levels = self.get_levels(arcs)
        self.highest_level = len(self.levels)
        self.highest_level = max(self.levels.values(), default=0)
        self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
        self.width = self.offset_x + len(words) * self.distance
        self.height = self.offset_y + 3 * self.word_spacing

@ -165,7 +330,7 @@ class DependencyRenderer:
        if start < 0 or end < 0:
            error_args = dict(start=start, end=end, label=label, dir=direction)
            raise ValueError(Errors.E157.format(**error_args))
        level = self.levels.index(end - start) + 1
        level = self.levels[(start, end, label)]
        x_start = self.offset_x + start * self.distance + self.arrow_spacing
        if self.direction == "rtl":
            x_start = self.width - x_start

@ -181,7 +346,7 @@ class DependencyRenderer:
        y_curve = self.offset_y - level * self.distance / 2
        if self.compact:
            y_curve = self.offset_y - level * self.distance / 6
        if y_curve == 0 and len(self.levels) > 5:
        if y_curve == 0 and max(self.levels.values(), default=0) > 5:
            y_curve = -self.distance
        arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
        arc = self.get_arc(x_start, y, y_curve, x_end)

@ -225,15 +390,23 @@ class DependencyRenderer:
        p1, p2, p3 = (end, end + self.arrow_width - 2, end - self.arrow_width + 2)
        return f"M{p1},{y + 2} L{p2},{y - self.arrow_width} {p3},{y - self.arrow_width}"

    def get_levels(self, arcs: List[Dict[str, Any]]) -> List[int]:
    def get_levels(self, arcs: List[Dict[str, Any]]) -> Dict[Tuple[int, int, str], int]:
        """Calculate available arc height "levels".
        Used to calculate arrow heights dynamically and without wasting space.

        args (list): Individual arcs and their start, end, direction and label.
        RETURNS (list): Arc levels sorted from lowest to highest.
        RETURNS (dict): Arc levels keyed by (start, end, label).
        """
        levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
        return sorted(list(levels))
        arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
        length = max([arc["end"] for arc in arcs], default=0)
        max_level = [0] * length
        levels = {}
        for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
            level = max(max_level[arc["start"] : arc["end"]]) + 1
            for i in range(arc["start"], arc["end"]):
                max_level[i] = level
            levels[(arc["start"], arc["end"], arc["label"])] = level
        return levels

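The rewritten get_levels() above assigns each arc the lowest height level that clears every shorter arc it overlaps, keyed by (start, end, label). A standalone sketch of the same logic on three illustrative arcs (the body is copied from the method above, minus self):

def get_levels(arcs):
    # deduplicate arcs, then place shorter arcs first so longer arcs stack above them
    arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
    length = max([arc["end"] for arc in arcs], default=0)
    max_level = [0] * length
    levels = {}
    for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
        level = max(max_level[arc["start"] : arc["end"]]) + 1
        for i in range(arc["start"], arc["end"]):
            max_level[i] = level
        levels[(arc["start"], arc["end"], arc["label"])] = level
    return levels

arcs = [
    {"start": 0, "end": 1, "label": "det", "dir": "left"},
    {"start": 1, "end": 2, "label": "nsubj", "dir": "left"},
    {"start": 0, "end": 2, "label": "dep", "dir": "left"},
]
levels = get_levels(arcs)
for key in sorted(levels):
    print(key, levels[key])
# (0, 1, 'det') 1
# (0, 2, 'dep') 2
# (1, 2, 'nsubj') 1
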
class EntityRenderer:

@ -242,7 +415,7 @@ class EntityRenderer:
    style = "ent"

    def __init__(self, options: Dict[str, Any] = {}) -> None:
        """Initialise dependency renderer.
        """Initialise entity renderer.

        options (dict): Visualiser-specific options (colors, ents)
        """

@ -62,6 +62,55 @@ TPL_ENT_RTL = """
</mark>
"""

TPL_SPANS = """
<div class="spans" style="line-height: 2.5; direction: {dir}">{content}</div>
"""

TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; position: relative;">
    {text}
    {span_slices}
    {span_starts}
</span>
"""

TPL_SPAN_SLICE = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""


TPL_SPAN_START = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
    <span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
        {label}{kb_link}
    </span>
</span>

"""

TPL_SPAN_RTL = """
<span style="font-weight: bold; display: inline-block; position: relative;">
    {text}
    {span_slices}
    {span_starts}
</span>
"""

TPL_SPAN_SLICE_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""

TPL_SPAN_START_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
    <span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
        {label}{kb_link}
    </span>
</span>
"""

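These templates are ordinary str.format strings that the SpanRenderer fills per token. A tiny sketch (the 40 and 17 mirror the default top_offset and top_offset_step read in SpanRenderer.__init__; the color is arbitrary):

from spacy.displacy.templates import TPL_SPAN_SLICE, TPL_SPAN_START

print(TPL_SPAN_SLICE.format(bg="#7aecec", top_offset=40))
print(TPL_SPAN_START.format(bg="#7aecec", top_offset=40, label="ORG", kb_link=""))
print(TPL_SPAN_SLICE.format(bg="#7aecec", top_offset=40 + 17))  # next stacked span
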
# Important: this needs to start with a space!
TPL_KB_LINK = """
 <a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>

@ -192,6 +192,13 @@ class Warnings(metaclass=ErrorsWithCodes):
    W115 = ("Skipping {method}: the floret vector table cannot be modified. "
            "Vectors are calculated from character ngrams.")
    W116 = ("Unable to clean attribute '{attr}'.")
    W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
            "surprising to you, make sure the Doc was processed using a model "
            "that supports span categorization, and check the `doc.spans[spans_key]` "
            "property manually if necessary.")
    W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
            "for the corpora used to train the language. Please check "
            "`nlp.meta[\"sources\"]` for any relevant links.")


class Errors(metaclass=ErrorsWithCodes):

@ -483,7 +490,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "components, since spans are only views of the Doc. Use Doc and "
            "Token attributes (or custom extension attributes) only and remove "
            "the following: {attrs}")
    E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
            "Only Doc and Token attributes are supported.")
    E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
            "to define the attribute? For example: `{attr}.???`")

@ -520,10 +527,14 @@ class Errors(metaclass=ErrorsWithCodes):
    E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")

    # New errors added in v3.x
    E855 = ("Invalid {obj}: {obj} is not from the same doc.")
    E856 = ("Error accessing span at position {i}: out of bounds in span group "
            "of length {length}.")
    E857 = ("Entry '{name}' not found in edit tree lemmatizer labels.")
    E858 = ("The {mode} vector table does not support this operation. "
            "{alternative}")
    E859 = ("The floret vector table cannot be modified.")
    E860 = ("Can't truncate fasttext-bloom vectors.")
    E860 = ("Can't truncate floret vectors.")
    E861 = ("No 'keys' should be provided when initializing floret vectors "
            "with 'minn' and 'maxn'.")
    E862 = ("'hash_count' must be between 1-4 for floret vectors.")

@ -566,9 +577,6 @@ class Errors(metaclass=ErrorsWithCodes):
    E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
            "a list of spans, with each span represented by a tuple (start_char, end_char). "
            "The tuple can be optionally extended with a label and a KB ID.")
    E880 = ("The 'wandb' library could not be found - did you install it? "
            "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
            "config section, instead of the 'WandbLogger'.")
    E884 = ("The pipeline could not be initialized because the vectors "
            "could not be found at '{vectors}'. If your pipeline was already "
            "initialized/trained before, call 'resume_training' instead of 'initialize', "

@ -894,6 +902,9 @@ class Errors(metaclass=ErrorsWithCodes):
            "patterns.")
    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
             "supported values are: 'I', 'O', 'B' and ''")
    E1026 = ("Edit tree has an invalid format:\n{errors}")
    E1027 = ("AlignmentArray only supports slicing with a step of 1.")
    E1028 = ("AlignmentArray only supports indexing using an int or a slice.")


# Deprecated model shortcuts, only used in errors and warnings

@ -1,3 +1,7 @@
import warnings
from .errors import Warnings


def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.

@ -11,6 +15,8 @@ def explain(term):
    """
    if term in GLOSSARY:
        return GLOSSARY[term]
    else:
        warnings.warn(Warnings.W118.format(term=term))


GLOSSARY = {

@ -310,7 +316,6 @@ GLOSSARY = {
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
    "sb": "subject",
    "sbp": "passivized subject (PP)",
    "sp": "subject or predicate",
    "svp": "separable verb prefix",

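With the change above, explain() warns (W118) instead of silently returning None for unknown terms. Minimal sketch:

import spacy

print(spacy.explain("nsubj"))       # "nominal subject"
print(spacy.explain("not-a-term"))  # None, and now emits warning W118
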
@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
_hangul_jamo = r"\u1100-\u11FF"
_hangul = _hangul_syllables + _hangul_jamo

_hiragana = r"\u3040-\u309F"
_katakana = r"\u30A0-\u30FFー"
_kana = _hiragana + _katakana

# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = (
    r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"

@ -244,6 +248,7 @@ _uncased = (
    + _tamil
    + _telugu
    + _hangul
    + _kana
    + _cjk
)

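The new _kana range is added to _uncased above. A small sketch using the two new module constants directly (private names, used here only for illustration):

import re
from spacy.lang.char_classes import _hiragana, _katakana

kana_re = re.compile(f"[{_hiragana}{_katakana}]+")
print(kana_re.findall("spaCy はすごい ライブラリ です"))  # ['はすごい', 'ライブラリ', 'です']
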
16 spacy/lang/dsb/__init__.py Normal file
@ -0,0 +1,16 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class LowerSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class LowerSorbian(Language):
    lang = "dsb"
    Defaults = LowerSorbianDefaults


__all__ = ["LowerSorbian"]

15 spacy/lang/dsb/examples.py Normal file
@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.dsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
    "Mi so tu jara derje spodoba.",
    "Kotre nowniny chceće měć?",
    "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
    "Zwóstanjo pótakem hyšći wjele źěła.",
]

113 spacy/lang/dsb/lex_attrs.py Normal file
@ -0,0 +1,113 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jaden",
    "jadna",
    "jadno",
    "dwa",
    "dwě",
    "tśi",
    "tśo",
    "styri",
    "styrjo",
    "pěś",
    "pěśo",
    "šesć",
    "šesćo",
    "sedym",
    "sedymjo",
    "wósym",
    "wósymjo",
    "źewjeś",
    "źewjeśo",
    "źaseś",
    "źaseśo",
    "jadnassćo",
    "dwanassćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    "dwanasćo",
    "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prědny",
    "prědna",
    "prědne",
    "drugi",
    "druga",
    "druge",
    "tśeśi",
    "tśeśa",
    "tśeśe",
    "stwórty",
    "stwórta",
    "stwórte",
    "pêty",
    "pěta",
    "pête",
    "šesty",
    "šesta",
    "šeste",
    "sedymy",
    "sedyma",
    "sedyme",
    "wósymy",
    "wósyma",
    "wósyme",
    "źewjety",
    "źewjeta",
    "źewjete",
    "źasety",
    "źaseta",
    "źasete",
    "jadnasty",
    "jadnasta",
    "jadnaste",
    "dwanasty",
    "dwanasta",
    "dwanaste",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}

15 spacy/lang/dsb/stop_words.py Normal file
@ -0,0 +1,15 @@
STOP_WORDS = set(
    """
a abo aby ako ale až

daniž dokulaž

gaž

jolic

pak pótom

teke togodla
""".split()
)

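A minimal sketch of the new Lower Sorbian data in use (assumes a spaCy build that contains the files above, so that "dsb" is a registered language code):

import spacy

nlp = spacy.blank("dsb")
print(nlp("pěśo")[0].like_num)  # True, via the LIKE_NUM getter defined above
print(nlp("teke")[0].is_stop)   # True, from the new STOP_WORDS
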
@ -447,7 +447,6 @@ for exc_data in [
    {ORTH: "La.", NORM: "Louisiana"},
    {ORTH: "Mar.", NORM: "March"},
    {ORTH: "Mass.", NORM: "Massachusetts"},
    {ORTH: "May.", NORM: "May"},
    {ORTH: "Mich.", NORM: "Michigan"},
    {ORTH: "Minn.", NORM: "Minnesota"},
    {ORTH: "Miss.", NORM: "Mississippi"},

@ -47,6 +47,41 @@ _num_words = [
]


_ordinal_words = [
    "primero",
    "segundo",
    "tercero",
    "cuarto",
    "quinto",
    "sexto",
    "séptimo",
    "octavo",
    "noveno",
    "décimo",
    "undécimo",
    "duodécimo",
    "decimotercero",
    "decimocuarto",
    "decimoquinto",
    "decimosexto",
    "decimoséptimo",
    "decimoctavo",
    "decimonoveno",
    "vigésimo",
    "trigésimo",
    "cuadragésimo",
    "quincuagésimo",
    "sexagésimo",
    "septuagésimo",
    "octogésima",
    "nonagésima",
    "centésima",
    "milésima",
    "millonésima",
    "billonésima",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]

@ -57,7 +92,11 @@ def like_num(text):
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text.lower() in _num_words:
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False

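The Spanish like_num() now also recognizes ordinal words. Small sketch, importing the module shown above:

from spacy.lang.es.lex_attrs import like_num

print(like_num("décimo"))   # True, via the new _ordinal_words check
print(like_num("123"))      # True, plain digits
print(like_num("palabra"))  # False
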
@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults

@ -11,6 +12,7 @@ class FinnishDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Finnish(Language):

79 spacy/lang/fi/syntax_iterators.py Normal file
@ -0,0 +1,79 @@
from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Detect base noun phrases from a dependency parse. Works on both Doc and Span."""
    labels = [
        "appos",
        "nsubj",
        "nsubj:cop",
        "obj",
        "obl",
        "ROOT",
    ]
    extend_labels = [
        "amod",
        "compound",
        "compound:nn",
        "flat:name",
        "nmod",
        "nmod:gobj",
        "nmod:gsubj",
        "nmod:poss",
        "nummod",
    ]

    def potential_np_head(word):
        return word.pos in (NOUN, PROPN) and (
            word.dep in np_deps or word.head.pos == PRON
        )

    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    np_deps = [doc.vocab.strings[label] for label in labels]
    extend_deps = [doc.vocab.strings[label] for label in extend_labels]
    np_label = doc.vocab.strings.add("NP")
    conj_label = doc.vocab.strings.add("conj")

    rbracket = 0
    prev_end = -1
    for i, word in enumerate(doclike):
        if i < rbracket:
            continue

        # Is this a potential independent NP head or coordinated with
        # a NOUN that is itself an independent NP head?
        #
        # e.g. "Terveyden ja hyvinvoinnin laitos"
        if potential_np_head(word) or (
            word.dep == conj_label and potential_np_head(word.head)
        ):
            # Try to extend to the left to include adjective/num
            # modifiers, compound words etc.
            lbracket = word.i
            for ldep in word.lefts:
                if ldep.dep in extend_deps:
                    lbracket = ldep.left_edge.i
                    break

            # Prevent nested chunks from being produced
            if lbracket <= prev_end:
                continue

            rbracket = word.i
            # Try to extend the span to the right to capture
            # appositions and noun modifiers
            for rdep in word.rights:
                if rdep.dep in extend_deps:
                    rbracket = rdep.i
            prev_end = rbracket

            yield lbracket, rbracket + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

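A hedged usage sketch for the new Finnish noun chunk iterator; the pipeline name is hypothetical and only stands in for any Finnish pipeline with a dependency parser (so that doc.has_annotation("DEP") holds):

import spacy

nlp = spacy.load("fi_example_pipeline")  # hypothetical package name
doc = nlp("Terveyden ja hyvinvoinnin laitos sijaitsee Helsingissä.")
for chunk in doc.noun_chunks:
    print(chunk.text)
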
@ -3,7 +3,7 @@ from ...attrs import LIKE_NUM

_num_words = set(
    """
zero un deux trois quatre cinq six sept huit neuf dix
zero un une deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
cent mille mil million milliard billion quadrillion quintillion

@ -13,7 +13,7 @@ sextillion septillion octillion nonillion decillion

_ordinal_words = set(
    """
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième

@ -6,16 +6,35 @@ from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
    # fmt: off
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "obl:arg",
        "obl:mod",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
    np_label = doc.vocab.strings.add("NP")
    adj_label = doc.vocab.strings.add("amod")
    det_label = doc.vocab.strings.add("det")
    det_pos = doc.vocab.strings.add("DET")
    adp_pos = doc.vocab.strings.add("ADP")
    conj_label = doc.vocab.strings.add("conj")
    conj_pos = doc.vocab.strings.add("CCONJ")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):

@ -24,16 +43,43 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            prev_end = word.right_edge.i
            yield word.left_edge.i, word.right_edge.i + 1, np_label
        elif word.dep == conj:
            right_childs = list(word.rights)
            right_child = right_childs[0] if right_childs else None

            if right_child:
                if (
                    right_child.dep == adj_label
                ):  # allow chain of adjectives by expanding to right
                    right_end = right_child.right_edge
                elif (
                    right_child.dep == det_label and right_child.pos == det_pos
                ):  # cut relative pronouns here
                    right_end = right_child
                elif right_child.dep in np_modifs:  # Check if we can expand to right
                    right_end = word.right_edge
                else:
                    right_end = word
            else:
                right_end = word
            prev_end = right_end.i

            left_index = word.left_edge.i
            left_index = left_index + 1 if word.left_edge.pos == adp_pos else left_index

            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj_label:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
            while head.dep == conj_label and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.right_edge.i
                yield word.left_edge.i, word.right_edge.i + 1, np_label
                prev_end = word.i

                left_index = word.left_edge.i  # eliminate left attached conjunction
                left_index = (
                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
                )
                yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

18 spacy/lang/hsb/__init__.py Normal file
@ -0,0 +1,18 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults


class UpperSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS


class UpperSorbian(Language):
    lang = "hsb"
    Defaults = UpperSorbianDefaults


__all__ = ["UpperSorbian"]

15 spacy/lang/hsb/examples.py Normal file
@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.hsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
    "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
    "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
    "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
    "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.",
]

106 spacy/lang/hsb/lex_attrs.py Normal file
@ -0,0 +1,106 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jedyn",
    "jedna",
    "jedne",
    "dwaj",
    "dwě",
    "tři",
    "třo",
    "štyri",
    "štyrjo",
    "pjeć",
    "šěsć",
    "sydom",
    "wosom",
    "dźewjeć",
    "dźesać",
    "jědnaće",
    "dwanaće",
    "třinaće",
    "štyrnaće",
    "pjatnaće",
    "šěsnaće",
    "sydomnaće",
    "wosomnaće",
    "dźewjatnaće",
    "dwaceći",
    "třiceći",
    "štyrceći",
    "pjećdźesat",
    "šěsćdźesat",
    "sydomdźesat",
    "wosomdźesat",
    "dźewjećdźesat",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prěni",
    "prěnja",
    "prěnje",
    "druhi",
    "druha",
    "druhe",
    "třeći",
    "třeća",
    "třeće",
    "štwórty",
    "štwórta",
    "štwórte",
    "pjaty",
    "pjata",
    "pjate",
    "šěsty",
    "šěsta",
    "šěste",
    "sydmy",
    "sydma",
    "sydme",
    "wosmy",
    "wosma",
    "wosme",
    "dźewjaty",
    "dźewjata",
    "dźewjate",
    "dźesaty",
    "dźesata",
    "dźesate",
    "jědnaty",
    "jědnata",
    "jědnate",
    "dwanaty",
    "dwanata",
    "dwanate",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}

19 spacy/lang/hsb/stop_words.py Normal file
@ -0,0 +1,19 @@
STOP_WORDS = set(
    """
a abo ale ani

dokelž

hdyž

jeli jelizo

kaž

pak potom

tež tohodla

zo zoby
""".split()
)

18 spacy/lang/hsb/tokenizer_exceptions.py Normal file
@ -0,0 +1,18 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc

_exc = dict()
for exc_data in [
    {ORTH: "mil.", NORM: "milion"},
    {ORTH: "wob.", NORM: "wobydler"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

for orth in [
    "resp.",
]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

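A minimal sketch of the new Upper Sorbian tokenizer exceptions (assumes a spaCy build that contains the files above, so that "hsb" is a registered language code; the input string is artificial and only exercises the exceptions):

import spacy

nlp = spacy.blank("hsb")
doc = nlp("mil. resp. wob.")
print([t.text for t in doc])   # ['mil.', 'resp.', 'wob.'] - kept as single tokens
print([t.norm_ for t in doc])  # ['milion', 'resp.', 'wobydler']
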
@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
from .lemmatizer import ItalianLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS


class ItalianDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Italian(Language):

|
@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto
|
||||||
|
|
||||||
basta bene benissimo brava bravo
|
basta bene benissimo brava bravo
|
||||||
|
|
||||||
casa caso cento certa certe certi certo che chi chicchessia chiunque ci
|
casa caso cento certa certe certi certo che chi chicchessia chiunque ci c'
|
||||||
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
|
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
|
||||||
cogli coi col colei coll coloro colui come cominci comunque con concernente
|
cogli coi col colei coll coloro colui come cominci comunque con concernente
|
||||||
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
|
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
|
||||||
|
|
||||||
da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
|
d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli
|
||||||
dei del dell della delle dello dentro detto deve di dice dietro dire
|
dei del dell dell' della delle dello dentro detto deve di dice dietro dire
|
||||||
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
|
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
|
||||||
dunque durante
|
dunque durante
|
||||||
|
|
||||||
ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
|
e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
|
||||||
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
|
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è
|
||||||
|
|
||||||
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
|
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
|
||||||
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
|
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
|
||||||
|
@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
|
||||||
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
|
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
|
||||||
frattempo fu fui fummo fuori furono futuro generale
|
frattempo fu fui fummo fuori furono futuro generale
|
||||||
|
|
||||||
gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
|
gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo
|
||||||
grande grazie gruppo
|
grande grazie gruppo
|
||||||
|
|
||||||
ha haha hai hanno ho
|
ha haha hai hanno ho
|
||||||
|
|
||||||
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
|
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
|
||||||
|
|
||||||
la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
|
l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
|
||||||
|
|
||||||
ma macche magari maggior mai male malgrado malissimo mancanza marche me
|
m' ma macche magari maggior mai male malgrado malissimo mancanza marche me
|
||||||
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
|
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
|
||||||
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
|
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
|
||||||
|
|
||||||
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
|
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun'
|
||||||
nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
|
nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre
|
||||||
nostri nostro novanta nove nulla nuovo
|
nostri nostro novanta nove nulla nuovo
|
||||||
|
|
||||||
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
|
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
|
||||||
|
@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente
|
||||||
proprio puo può pure purtroppo
|
proprio puo può pure purtroppo
|
||||||
|
|
||||||
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
|
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
|
||||||
quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
|
quanti quanto quantunque quasi quattro quel quel' quella quelle quelli quello quest quest'
|
||||||
questa queste questi questo qui quindi
|
questa queste questi questo qui quindi
|
||||||
|
|
||||||
realmente recente recentemente registrazione relativo riecco salvo
|
realmente recente recentemente registrazione relativo riecco salvo
|
||||||
|
|
||||||
sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
|
s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
|
||||||
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
|
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
|
||||||
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
|
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
|
||||||
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
|
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
|
||||||
|
@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
|
||||||
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
|
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
|
||||||
sullo suo suoi
|
sullo suo suoi
|
||||||
|
|
||||||
tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
|
t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
|
||||||
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
|
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
|
||||||
|
|
||||||
uguali ulteriore ultimo un una uno uomo
|
uguali ulteriore ultimo un un' una uno uomo
|
||||||
|
|
||||||
va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
|
v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
|
||||||
vostra vostre vostri vostro
|
vostra vostre vostri vostro
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|

86  spacy/lang/it/syntax_iterators.py  Normal file
@ -0,0 +1,86 @@
from typing import Union, Iterator, Tuple

from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
    dets = ["det", "det:poss"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
    np_label = doc.vocab.strings.add("NP")
    adj_label = doc.vocab.strings.add("amod")
    det_labels = {doc.vocab.strings.add(det) for det in dets}
    det_pos = doc.vocab.strings.add("DET")
    adp_label = doc.vocab.strings.add("ADP")
    conj = doc.vocab.strings.add("conj")
    conj_pos = doc.vocab.strings.add("CCONJ")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            right_childs = list(word.rights)
            right_child = right_childs[0] if right_childs else None

            if right_child:
                if (
                    right_child.dep == adj_label
                ):  # allow chain of adjectives by expanding to right
                    right_end = right_child.right_edge
                elif (
                    right_child.dep in det_labels and right_child.pos == det_pos
                ):  # cut relative pronouns here
                    right_end = right_child
                elif right_child.dep in np_modifs:  # Check if we can expand to right
                    right_end = word.right_edge
                else:
                    right_end = word
            else:
                right_end = word
            prev_end = right_end.i

            left_index = word.left_edge.i
            left_index = (
                left_index + 1 if word.left_edge.pos == adp_label else left_index
            )

            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i

                left_index = word.left_edge.i  # eliminate left attached conjunction
                left_index = (
                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
                )
                yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
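The iterator is registered through SYNTAX_ITERATORS, so it is reached through the normal Doc.noun_chunks API once a dependency parse is available. A minimal, hedged usage sketch: the it_core_news_sm model name and the sample sentence are illustrative assumptions, not part of this diff.

import spacy

# Assumes an Italian pipeline with a parser is installed; any parsed Italian
# Doc would work the same way.
nlp = spacy.load("it_core_news_sm")
doc = nlp("La squadra italiana ha vinto la partita di ieri.")

# noun_chunks is wired in via SYNTAX_ITERATORS, so the usual Doc API applies.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)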

spacy/lang/ko/__init__.py
@ -1,12 +1,13 @@
 from typing import Iterator, Any, Dict

+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
 from ...language import Language, BaseDefaults
 from ...tokens import Doc
 from ...scorer import Scorer
-from ...symbols import POS
+from ...symbols import POS, X
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
 from ...vocab import Vocab

@ -31,15 +32,24 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, vocab: Vocab):
         self.vocab = vocab
-        MeCab = try_mecab_import()  # type: ignore[func-returns-value]
-        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
+        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer

     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)

-    def __del__(self):
-        self.mecab_tokenizer.__del__()
-
     def __call__(self, text: str) -> Doc:
         dtokens = list(self.detailed_tokens(text))
         surfaces = [dt["surface"] for dt in dtokens]

@ -47,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            token.pos = TAG_MAP[token.tag_][POS]
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc

@ -76,6 +89,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES


 class Korean(Language):

@ -90,7 +104,8 @@ def try_mecab_import() -> None:
         return MeCab
     except ImportError:
         raise ImportError(
-            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
+            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
             "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
             "and [natto-py](https://github.com/buruzaemon/natto-py)"
         ) from None
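The new property defers creating the MeCab instance until the tokenizer is actually called, so a blank:ko pipeline can be constructed (for example by `spacy init vectors ko`) on a machine without mecab-ko installed. A stripped-down sketch of the same lazy-initialization pattern, using made-up class and backend names rather than the real spaCy/MeCab objects:

# Generic sketch of the lazy-initialization pattern used above: the heavy,
# optional dependency is only touched when the attribute is first accessed.
class LazyTokenizer:
    def __init__(self):
        self._backend = None  # nothing created yet, so construction stays cheap

    @property
    def backend(self):
        if self._backend is None:
            # Imagine an expensive or optional import/initialization here.
            self._backend = object()
        return self._backend


t = LazyTokenizer()            # no backend created
_ = t.backend                  # first access triggers creation
assert t.backend is t.backend  # later accesses reuse the same instance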

12  spacy/lang/ko/punctuation.py  Normal file
@ -0,0 +1,12 @@
from ..char_classes import LIST_QUOTES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES


_infixes = (
    ["·", "ㆍ", "\(", "\)"]
    + [r"(?<=[0-9])~(?=[0-9-])"]
    + LIST_QUOTES
    + BASE_TOKENIZER_INFIXES
)

TOKENIZER_INFIXES = _infixes
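The list adds the middle dots ·/ㆍ, parentheses, quotes and a tilde between digits to the shared base infixes, so number ranges such as 1~10 get split. A standalone check of just the tilde rule, using re directly instead of the spaCy tokenizer so it runs without mecab-ko installed:

import re

# Mirrors the single new range pattern from the infix list above.
infix_range = re.compile(r"(?<=[0-9])~(?=[0-9-])")

print(infix_range.search("1~10"))    # matches, so "1~10" can be split at the tilde
print(infix_range.search("물결~표"))  # no match: the rule only fires between digits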
@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av

 bak bare bedre beste blant ble bli blir blitt bris by både

-da dag de del dem den denne der dermed det dette disse drept du
+da dag de del dem den denne der dermed det dette disse du

 eller en enn er et ett etter

-fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
+fem fikk fire fjor flere folk for fortsatt fra fram
 funnet få får fått før først første

 gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går

-ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
-hvorfor
+ha hadde ham han hans har hele helt henne hennes her hun

 i ifølge igjen ikke ingen inn

 ja jeg

 kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
-kvinner

-la laget land landet langt leder ligger like litt løpet lørdag
+la laget land landet langt leder ligger like litt løpet

-man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
-millioner minutter mot msci mye må mål måtte
+man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte

-ned neste noe noen nok norge norsk norske ntb ny nye nå når
+ned neste noe noen nok ny nye nå når

-og også om onsdag opp opplyser oslo oss over
+og også om opp opplyser oss over

-personer plass poeng politidistrikt politiet president prosent på
+personer plass poeng på

-regjeringen runde rundt russland
+runde rundt

-sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
+sa saken samme sammen samtidig satt se seg seks selv senere ser sett
 siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
-store står sverige svært så søndag
+store står svært så

-ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
-tyskland
+ta tatt tid tidligere til tilbake tillegg tok tror

-under usa ut uten utenfor
+under ut uten utenfor

 vant var ved veldig vi videre viktig vil ville viser vår være vært

@ -1,56 +1,219 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = list(
|
||||||
"ноль",
|
set(
|
||||||
"один",
|
"""
|
||||||
"два",
|
ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми
|
||||||
"три",
|
|
||||||
"четыре",
|
четверть четверти четвертью четвертей четвертям четвертями четвертях
|
||||||
"пять",
|
|
||||||
"шесть",
|
треть трети третью третей третям третями третях
|
||||||
"семь",
|
|
||||||
"восемь",
|
половина половины половине половину половиной половин половинам половинами половинах половиною
|
||||||
"девять",
|
|
||||||
"десять",
|
один одного одному одним одном
|
||||||
"одиннадцать",
|
первой первого первому первом первый первым первых
|
||||||
"двенадцать",
|
во-первых
|
||||||
"тринадцать",
|
единица единицы единице единицу единицей единиц единицам единицами единицах единицею
|
||||||
"четырнадцать",
|
|
||||||
"пятнадцать",
|
два двумя двум двух двоих двое две
|
||||||
"шестнадцать",
|
второго второму второй втором вторым вторых
|
||||||
"семнадцать",
|
двойка двойки двойке двойку двойкой двоек двойкам двойками двойках двойкою
|
||||||
"восемнадцать",
|
во-вторых
|
||||||
"девятнадцать",
|
оба обе обеим обеими обеих обоим обоими обоих
|
||||||
"двадцать",
|
|
||||||
"тридцать",
|
полтора полторы полутора
|
||||||
"сорок",
|
|
||||||
"пятьдесят",
|
три третьего третьему третьем третьим третий тремя трем трех трое троих трёх
|
||||||
"шестьдесят",
|
тройка тройки тройке тройку тройкою троек тройкам тройками тройках тройкой
|
||||||
"семьдесят",
|
троечка троечки троечке троечку троечкой троечек троечкам троечками троечках троечкой
|
||||||
"восемьдесят",
|
трешка трешки трешке трешку трешкой трешек трешкам трешками трешках трешкою
|
||||||
"девяносто",
|
трёшка трёшки трёшке трёшку трёшкой трёшек трёшкам трёшками трёшках трёшкою
|
||||||
"сто",
|
трояк трояка трояку трояком трояке трояки трояков троякам трояками трояках
|
||||||
"двести",
|
треха треху трехой
|
||||||
"триста",
|
трёха трёху трёхой
|
||||||
"четыреста",
|
втроем втроём
|
||||||
"пятьсот",
|
|
||||||
"шестьсот",
|
четыре четвертого четвертому четвертом четвертый четвертым четверка четырьмя четырем четырех четверо четырёх четверым
|
||||||
"семьсот",
|
четверых
|
||||||
"восемьсот",
|
вчетвером
|
||||||
"девятьсот",
|
|
||||||
"тысяча",
|
пять пятого пятому пятом пятый пятым пятью пяти пятеро пятерых пятерыми
|
||||||
"миллион",
|
впятером
|
||||||
"миллиард",
|
пятерочка пятерочки пятерочке пятерочками пятерочкой пятерочку пятерочкой пятерочками
|
||||||
"триллион",
|
пятёрочка пятёрочки пятёрочке пятёрочками пятёрочкой пятёрочку пятёрочкой пятёрочками
|
||||||
"квадриллион",
|
пятерка пятерки пятерке пятерками пятеркой пятерку пятерками
|
||||||
"квинтиллион",
|
пятёрка пятёрки пятёрке пятёрками пятёркой пятёрку пятёрками
|
||||||
]
|
пятёра пятёры пятёре пятёрами пятёрой пятёру пятёрами
|
||||||
|
пятера пятеры пятере пятерами пятерой пятеру пятерами
|
||||||
|
пятак пятаки пятаке пятаками пятаком пятаку пятаками
|
||||||
|
|
||||||
|
шесть шестерка шестого шестому шестой шестом шестым шестью шести шестеро шестерых
|
||||||
|
вшестером
|
||||||
|
|
||||||
|
семь семерка седьмого седьмому седьмой седьмом седьмым семью семи семеро седьмых
|
||||||
|
всемером
|
||||||
|
|
||||||
|
восемь восьмерка восьмого восьмому восемью восьмой восьмом восьмым восеми восьмером восьми восьмью
|
||||||
|
восьмерых
|
||||||
|
ввосьмером
|
||||||
|
|
||||||
|
девять девятого девятому девятка девятом девятый девятым девятью девяти девятером вдевятером девятерых
|
||||||
|
вдевятером
|
||||||
|
|
||||||
|
десять десятого десятому десятка десятом десятый десятым десятью десяти десятером десятых
|
||||||
|
вдесятером
|
||||||
|
|
||||||
|
одиннадцать одиннадцатого одиннадцатому одиннадцатом одиннадцатый одиннадцатым одиннадцатью одиннадцати
|
||||||
|
одиннадцатых
|
||||||
|
|
||||||
|
двенадцать двенадцатого двенадцатому двенадцатом двенадцатый двенадцатым двенадцатью двенадцати
|
||||||
|
двенадцатых
|
||||||
|
|
||||||
|
тринадцать тринадцатого тринадцатому тринадцатом тринадцатый тринадцатым тринадцатью тринадцати
|
||||||
|
тринадцатых
|
||||||
|
|
||||||
|
четырнадцать четырнадцатого четырнадцатому четырнадцатом четырнадцатый четырнадцатым четырнадцатью четырнадцати
|
||||||
|
четырнадцатых
|
||||||
|
|
||||||
|
пятнадцать пятнадцатого пятнадцатому пятнадцатом пятнадцатый пятнадцатым пятнадцатью пятнадцати
|
||||||
|
пятнадцатых
|
||||||
|
пятнарик пятнарику пятнариком пятнарики
|
||||||
|
|
||||||
|
шестнадцать шестнадцатого шестнадцатому шестнадцатом шестнадцатый шестнадцатым шестнадцатью шестнадцати
|
||||||
|
шестнадцатых
|
||||||
|
|
||||||
|
семнадцать семнадцатого семнадцатому семнадцатом семнадцатый семнадцатым семнадцатью семнадцати семнадцатых
|
||||||
|
|
||||||
|
восемнадцать восемнадцатого восемнадцатому восемнадцатом восемнадцатый восемнадцатым восемнадцатью восемнадцати
|
||||||
|
восемнадцатых
|
||||||
|
|
||||||
|
девятнадцать девятнадцатого девятнадцатому девятнадцатом девятнадцатый девятнадцатым девятнадцатью девятнадцати
|
||||||
|
девятнадцатых
|
||||||
|
|
||||||
|
двадцать двадцатого двадцатому двадцатом двадцатый двадцатым двадцатью двадцати двадцатых
|
||||||
|
|
||||||
|
четвертак четвертака четвертаке четвертаку четвертаки четвертаком четвертаками
|
||||||
|
|
||||||
|
тридцать тридцатого тридцатому тридцатом тридцатый тридцатым тридцатью тридцати тридцатых
|
||||||
|
тридцадка тридцадку тридцадке тридцадки тридцадкой тридцадкою тридцадками
|
||||||
|
|
||||||
|
тридевять тридевяти тридевятью
|
||||||
|
|
||||||
|
сорок сорокового сороковому сороковом сороковым сороковой сороковых
|
||||||
|
сорокет сорокета сорокету сорокете сорокеты сорокетом сорокетами сорокетам
|
||||||
|
|
||||||
|
пятьдесят пятьдесятого пятьдесятому пятьюдесятью пятьдесятом пятьдесятый пятьдесятым пятидесяти пятьдесятых
|
||||||
|
полтинник полтинника полтиннике полтиннику полтинники полтинником полтинниками полтинникам полтинниках
|
||||||
|
пятидесятка пятидесятке пятидесятку пятидесятки пятидесяткой пятидесятками пятидесяткам пятидесятках
|
||||||
|
полтос полтоса полтосе полтосу полтосы полтосом полтосами полтосам полтосах
|
||||||
|
|
||||||
|
шестьдесят шестьдесятого шестьдесятому шестьюдесятью шестьдесятом шестьдесятый шестьдесятым шестидесятые шестидесяти
|
||||||
|
шестьдесятых
|
||||||
|
|
||||||
|
семьдесят семьдесятого семьдесятому семьюдесятью семьдесятом семьдесятый семьдесятым семидесяти семьдесятых
|
||||||
|
|
||||||
|
восемьдесят восемьдесятого восемьдесятому восемьюдесятью восемьдесятом восемьдесятый восемьдесятым восемидесяти
|
||||||
|
восьмидесяти восьмидесятых
|
||||||
|
|
||||||
|
девяносто девяностого девяностому девяностом девяностый девяностым девяноста девяностых
|
||||||
|
|
||||||
|
сто сотого сотому сотом сотен сотый сотым ста
|
||||||
|
стольник стольника стольнику стольнике стольники стольником стольниками
|
||||||
|
сотка сотки сотке соткой сотками соткам сотках
|
||||||
|
сотня сотни сотне сотней сотнями сотням сотнях
|
||||||
|
|
||||||
|
двести двумястами двухсотого двухсотому двухсотом двухсотый двухсотым двумстам двухстах двухсот
|
||||||
|
|
||||||
|
триста тремястами трехсотого трехсотому трехсотом трехсотый трехсотым тремстам трехстах трехсот
|
||||||
|
|
||||||
|
четыреста четырехсотого четырехсотому четырьмястами четырехсотом четырехсотый четырехсотым четыремстам четырехстах
|
||||||
|
четырехсот
|
||||||
|
|
||||||
|
пятьсот пятисотого пятисотому пятьюстами пятисотом пятисотый пятисотым пятистам пятистах пятисот
|
||||||
|
пятисотка пятисотки пятисотке пятисоткой пятисотками пятисоткам пятисоткою пятисотках
|
||||||
|
пятихатка пятихатки пятихатке пятихаткой пятихатками пятихаткам пятихаткою пятихатках
|
||||||
|
пятифан пятифаны пятифане пятифаном пятифанами пятифанах
|
||||||
|
|
||||||
|
шестьсот шестисотого шестисотому шестьюстами шестисотом шестисотый шестисотым шестистам шестистах шестисот
|
||||||
|
|
||||||
|
семьсот семисотого семисотому семьюстами семисотом семисотый семисотым семистам семистах семисот
|
||||||
|
|
||||||
|
восемьсот восемисотого восемисотому восемисотом восемисотый восемисотым восьмистами восьмистам восьмистах восьмисот
|
||||||
|
|
||||||
|
девятьсот девятисотого девятисотому девятьюстами девятисотом девятисотый девятисотым девятистам девятистах девятисот
|
||||||
|
|
||||||
|
тысяча тысячного тысячному тысячном тысячный тысячным тысячам тысячах тысячей тысяч тысячи тыс
|
||||||
|
косарь косаря косару косарем косарями косарях косарям косарей
|
||||||
|
|
||||||
|
десятитысячный десятитысячного десятитысячному десятитысячным десятитысячном десятитысячная десятитысячной
|
||||||
|
десятитысячную десятитысячною десятитысячное десятитысячные десятитысячных десятитысячными
|
||||||
|
|
||||||
|
двадцатитысячный двадцатитысячного двадцатитысячному двадцатитысячным двадцатитысячном двадцатитысячная
|
||||||
|
двадцатитысячной двадцатитысячную двадцатитысячною двадцатитысячное двадцатитысячные двадцатитысячных
|
||||||
|
двадцатитысячными
|
||||||
|
|
||||||
|
тридцатитысячный тридцатитысячного тридцатитысячному тридцатитысячным тридцатитысячном тридцатитысячная
|
||||||
|
тридцатитысячной тридцатитысячную тридцатитысячною тридцатитысячное тридцатитысячные тридцатитысячных
|
||||||
|
тридцатитысячными
|
||||||
|
|
||||||
|
сорокатысячный сорокатысячного сорокатысячному сорокатысячным сорокатысячном сорокатысячная
|
||||||
|
сорокатысячной сорокатысячную сорокатысячною сорокатысячное сорокатысячные сорокатысячных
|
||||||
|
сорокатысячными
|
||||||
|
|
||||||
|
пятидесятитысячный пятидесятитысячного пятидесятитысячному пятидесятитысячным пятидесятитысячном пятидесятитысячная
|
||||||
|
пятидесятитысячной пятидесятитысячную пятидесятитысячною пятидесятитысячное пятидесятитысячные пятидесятитысячных
|
||||||
|
пятидесятитысячными
|
||||||
|
|
||||||
|
шестидесятитысячный шестидесятитысячного шестидесятитысячному шестидесятитысячным шестидесятитысячном шестидесятитысячная
|
||||||
|
шестидесятитысячной шестидесятитысячную шестидесятитысячною шестидесятитысячное шестидесятитысячные шестидесятитысячных
|
||||||
|
шестидесятитысячными
|
||||||
|
|
||||||
|
семидесятитысячный семидесятитысячного семидесятитысячному семидесятитысячным семидесятитысячном семидесятитысячная
|
||||||
|
семидесятитысячной семидесятитысячную семидесятитысячною семидесятитысячное семидесятитысячные семидесятитысячных
|
||||||
|
семидесятитысячными
|
||||||
|
|
||||||
|
восьмидесятитысячный восьмидесятитысячного восьмидесятитысячному восьмидесятитысячным восьмидесятитысячном восьмидесятитысячная
|
||||||
|
восьмидесятитысячной восьмидесятитысячную восьмидесятитысячною восьмидесятитысячное восьмидесятитысячные восьмидесятитысячных
|
||||||
|
восьмидесятитысячными
|
||||||
|
|
||||||
|
стотысячный стотысячного стотысячному стотысячным стотысячном стотысячная стотысячной стотысячную стотысячное
|
||||||
|
стотысячные стотысячных стотысячными стотысячною
|
||||||
|
|
||||||
|
миллион миллионного миллионов миллионному миллионном миллионный миллионным миллионом миллиона миллионе миллиону
|
||||||
|
миллионов
|
||||||
|
лям ляма лямы лямом лямами лямах лямов
|
||||||
|
млн
|
||||||
|
|
||||||
|
десятимиллионная десятимиллионной десятимиллионными десятимиллионный десятимиллионным десятимиллионному
|
||||||
|
десятимиллионными десятимиллионную десятимиллионное десятимиллионные десятимиллионных десятимиллионною
|
||||||
|
|
||||||
|
миллиард миллиардного миллиардному миллиардном миллиардный миллиардным миллиардом миллиарда миллиарде миллиарду
|
||||||
|
миллиардов
|
||||||
|
лярд лярда лярды лярдом лярдами лярдах лярдов
|
||||||
|
млрд
|
||||||
|
|
||||||
|
триллион триллионного триллионному триллионном триллионный триллионным триллионом триллиона триллионе триллиону
|
||||||
|
триллионов трлн
|
||||||
|
|
||||||
|
квадриллион квадриллионного квадриллионному квадриллионный квадриллионным квадриллионом квадриллиона квадриллионе
|
||||||
|
квадриллиону квадриллионов квадрлн
|
||||||
|
|
||||||
|
квинтиллион квинтиллионного квинтиллионному квинтиллионный квинтиллионным квинтиллионом квинтиллиона квинтиллионе
|
||||||
|
квинтиллиону квинтиллионов квинтлн
|
||||||
|
|
||||||
|
i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix
|
||||||
|
""".split()
|
||||||
|
)
|
||||||
|
)
|
||||||

 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
+    if text.endswith("%"):
+        text = text[:-1]
     text = text.replace(",", "").replace(".", "")
     if text.isdigit():
         return True
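The only functional change to like_num here is the new "%" stripping before the digit check. A standalone copy of that digit path follows; the rest of the real lex attribute, which also matches the spelled-out forms in _num_words above, is unchanged and not reproduced.

def like_num(text: str) -> bool:
    # Standalone copy of the digit path shown in the diff above; the full spaCy
    # version additionally falls back to the _num_words list.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    if text.endswith("%"):  # new in this change: "15%" now counts as number-like
        text = text[:-1]
    text = text.replace(",", "").replace(".", "")
    return text.isdigit()


print(like_num("15%"))   # True
print(like_num("-3,5"))  # True
print(like_num("пять"))  # False here; the full version returns True via _num_words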
|
@ -1,52 +1,111 @@
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
а
|
а авось ага агу аж ай али алло ау ах ая
|
||||||
|
|
||||||
будем будет будете будешь буду будут будучи будь будьте бы был была были было
|
б будем будет будете будешь буду будут будучи будь будьте бы был была были было
|
||||||
быть
|
быть бац без безусловно бишь благо благодаря ближайшие близко более больше
|
||||||
|
будто бывает бывала бывали бываю бывают бытует
|
||||||
|
|
||||||
в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею
|
в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею
|
||||||
всея всю вся вы
|
всея всю вся вы ваш ваша ваше ваши вдали вдобавок вдруг ведь везде вернее
|
||||||
|
взаимно взаправду видно вишь включая вместо внакладе вначале вне вниз внизу
|
||||||
|
вновь вовсе возможно воистину вокруг вон вообще вопреки вперекор вплоть
|
||||||
|
вполне вправду вправе впрочем впрямь вресноту вроде вряд всегда всюду
|
||||||
|
всякий всякого всякой всячески вчеред
|
||||||
|
|
||||||
да для до
|
г го где гораздо гав
|
||||||
|
|
||||||
его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею
|
д да для до дабы давайте давно давным даже далее далеко дальше данная
|
||||||
|
данного данное данной данном данному данные данный данных дану данунах
|
||||||
|
даром де действительно довольно доколе доколь долго должен должна
|
||||||
|
должно должны должный дополнительно другая другие другим другими
|
||||||
|
других другое другой
|
||||||
|
|
||||||
же
|
е его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею едва
|
||||||
|
ежели еле
|
||||||
|
|
||||||
за
|
ж же
|
||||||
|
|
||||||
и из или им ими имъ их
|
з за затем зато зачем здесь значит зря
|
||||||
|
|
||||||
|
и из или им ими имъ их ибо иль имеет имел имела имело именно иметь иначе
|
||||||
|
иногда иным иными итак ишь
|
||||||
|
|
||||||
|
й
|
||||||
|
|
||||||
к как кем ко когда кого ком кому комья которая которого которое которой котором
|
к как кем ко когда кого ком кому комья которая которого которое которой котором
|
||||||
которому которою которую которые который которым которыми которых кто
|
которому которою которую которые который которым которыми которых кто ка кабы
|
||||||
|
каждая каждое каждые каждый кажется казалась казались казалось казался казаться
|
||||||
|
какая какие каким какими каков какого какой какому какою касательно кой коли
|
||||||
|
коль конечно короче кроме кстати ку куда
|
||||||
|
|
||||||
меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего
|
л ли либо лишь любая любого любое любой любом любую любыми любых
|
||||||
|
|
||||||
|
м меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего
|
||||||
моей моем моём моему моею можем может можете можешь мои мой моим моими моих
|
моей моем моём моему моею можем может можете можешь мои мой моим моими моих
|
||||||
мочь мою моя мы
|
мочь мою моя мы мало меж между менее меньше мимо многие много многого многое
|
||||||
|
многом многому можно мол му
|
||||||
|
|
||||||
на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим
|
н на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим
|
||||||
нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но
|
нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но
|
||||||
|
наверняка наверху навряд навыворот над надо назад наиболее наизворот
|
||||||
|
наизнанку наипаче накануне наконец наоборот наперед наперекор наподобие
|
||||||
|
например напротив напрямую насилу настоящая настоящее настоящие настоящий
|
||||||
|
насчет нате находиться начала начале неважно негде недавно недалеко незачем
|
||||||
|
некем некогда некому некоторая некоторые некоторый некоторых некто некуда
|
||||||
|
нельзя немногие немногим немного необходимо необходимости необходимые
|
||||||
|
необходимым неоткуда непрерывно нередко несколько нету неужели нечего
|
||||||
|
нечем нечему нечто нешто нибудь нигде ниже низко никак никакой никем
|
||||||
|
никогда никого никому никто никуда ниоткуда нипочем ничего ничем ничему
|
||||||
|
ничто ну нужная нужно нужного нужные нужный нужных ныне нынешнее нынешней
|
||||||
|
нынешних нынче
|
||||||
|
|
||||||
о об один одна одни одним одними одних одно одного одной одном одному одною
|
о об один одна одни одним одними одних одно одного одной одном одному одною
|
||||||
одну он она оне они оно от
|
одну он она оне они оно от оба общую обычно ого однажды однако ой около оный
|
||||||
|
оп опять особенно особо особую особые откуда отнелижа отнелиже отовсюду
|
||||||
|
отсюда оттого оттот оттуда отчего отчему ох очевидно очень ом
|
||||||
|
|
||||||
по при
|
п по при паче перед под подавно поди подобная подобно подобного подобные
|
||||||
|
подобный подобным подобных поелику пожалуй пожалуйста позже поистине
|
||||||
|
пока покамест поколе поколь покуда покудова помимо понеже поприще пор
|
||||||
|
пора посему поскольку после посреди посредством потом потому потомушта
|
||||||
|
похожем почему почти поэтому прежде притом причем про просто прочего
|
||||||
|
прочее прочему прочими проще прям пусть
|
||||||
|
|
||||||
|
р ради разве ранее рано раньше рядом
|
||||||
|
|
||||||
с сам сама сами самим самими самих само самого самом самому саму свое своё
|
с сам сама сами самим самими самих само самого самом самому саму свое своё
|
||||||
своего своей своем своём своему своею свои свой своим своими своих свою своя
|
своего своей своем своём своему своею свои свой своим своими своих свою своя
|
||||||
себе себя собой собою
|
себе себя собой собою самая самое самой самый самых сверх свыше се сего сей
|
||||||
|
сейчас сие сих сквозь сколько скорее скоро следует слишком смогут сможет
|
||||||
|
сначала снова со собственно совсем сперва спокону спустя сразу среди сродни
|
||||||
|
стал стала стали стало стать суть сызнова
|
||||||
|
|
||||||
та так такая такие таким такими таких такого такое такой таком такому такою
|
та то ту ты ти так такая такие таким такими таких такого такое такой таком такому такою
|
||||||
такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому
|
такую те тебе тебя тем теми тех тобой тобою того той только том томах тому
|
||||||
тот тою ту ты
|
тот тою также таки таков такова там твои твоим твоих твой твоя твоё
|
||||||
|
теперь тогда тоже тотчас точно туда тут тьфу тая
|
||||||
|
|
||||||
у уже
|
у уже увы уж ура ух ую
|
||||||
|
|
||||||
чего чем чём чему что чтобы
|
ф фу
|
||||||
|
|
||||||
эта эти этим этими этих это этого этой этом этому этот этою эту
|
х ха хе хорошо хотел хотела хотелось хотеть хоть хотя хочешь хочу хуже
|
||||||
|
|
||||||
я
|
ч чего чем чём чему что чтобы часто чаще чей через чтоб чуть чхать чьим
|
||||||
|
чьих чьё чё
|
||||||
|
|
||||||
|
ш ша
|
||||||
|
|
||||||
|
щ ща щас
|
||||||
|
|
||||||
|
ы ых ые ый
|
||||||
|
|
||||||
|
э эта эти этим этими этих это этого этой этом этому этот этою эту эдак эдакий
|
||||||
|
эй эка экий этак этакий эх
|
||||||
|
|
||||||
|
ю
|
||||||
|
|
||||||
|
я явно явных яко якобы якоже
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,7 +2,6 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
_abbrev_exc = [
|
_abbrev_exc = [
|
||||||
|
@ -42,7 +41,6 @@ _abbrev_exc = [
|
||||||
{ORTH: "дек", NORM: "декабрь"},
|
{ORTH: "дек", NORM: "декабрь"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
for abbrev_desc in _abbrev_exc:
|
for abbrev_desc in _abbrev_exc:
|
||||||
abbrev = abbrev_desc[ORTH]
|
abbrev = abbrev_desc[ORTH]
|
||||||
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
||||||
|
@ -50,17 +48,354 @@ for abbrev_desc in _abbrev_exc:
|
||||||
_exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
|
_exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
|
||||||
|
|
||||||
|
|
||||||
_slang_exc = [
|
for abbr in [
|
||||||
|
# Year slang abbreviations
|
||||||
{ORTH: "2к15", NORM: "2015"},
|
{ORTH: "2к15", NORM: "2015"},
|
||||||
{ORTH: "2к16", NORM: "2016"},
|
{ORTH: "2к16", NORM: "2016"},
|
||||||
{ORTH: "2к17", NORM: "2017"},
|
{ORTH: "2к17", NORM: "2017"},
|
||||||
{ORTH: "2к18", NORM: "2018"},
|
{ORTH: "2к18", NORM: "2018"},
|
||||||
{ORTH: "2к19", NORM: "2019"},
|
{ORTH: "2к19", NORM: "2019"},
|
||||||
{ORTH: "2к20", NORM: "2020"},
|
{ORTH: "2к20", NORM: "2020"},
|
||||||
]
|
{ORTH: "2к21", NORM: "2021"},
|
||||||
|
{ORTH: "2к22", NORM: "2022"},
|
||||||
|
{ORTH: "2к23", NORM: "2023"},
|
||||||
|
{ORTH: "2к24", NORM: "2024"},
|
||||||
|
{ORTH: "2к25", NORM: "2025"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
for slang_desc in _slang_exc:
|
for abbr in [
|
||||||
_exc[slang_desc[ORTH]] = [slang_desc]
|
# Profession and academic titles abbreviations
|
||||||
|
{ORTH: "ак.", NORM: "академик"},
|
||||||
|
{ORTH: "акад.", NORM: "академик"},
|
||||||
|
{ORTH: "д-р архитектуры", NORM: "доктор архитектуры"},
|
||||||
|
{ORTH: "д-р биол. наук", NORM: "доктор биологических наук"},
|
||||||
|
{ORTH: "д-р ветеринар. наук", NORM: "доктор ветеринарных наук"},
|
||||||
|
{ORTH: "д-р воен. наук", NORM: "доктор военных наук"},
|
||||||
|
{ORTH: "д-р геогр. наук", NORM: "доктор географических наук"},
|
||||||
|
{ORTH: "д-р геол.-минерал. наук", NORM: "доктор геолого-минералогических наук"},
|
||||||
|
{ORTH: "д-р искусствоведения", NORM: "доктор искусствоведения"},
|
||||||
|
{ORTH: "д-р ист. наук", NORM: "доктор исторических наук"},
|
||||||
|
{ORTH: "д-р культурологии", NORM: "доктор культурологии"},
|
||||||
|
{ORTH: "д-р мед. наук", NORM: "доктор медицинских наук"},
|
||||||
|
{ORTH: "д-р пед. наук", NORM: "доктор педагогических наук"},
|
||||||
|
{ORTH: "д-р полит. наук", NORM: "доктор политических наук"},
|
||||||
|
{ORTH: "д-р психол. наук", NORM: "доктор психологических наук"},
|
||||||
|
{ORTH: "д-р с.-х. наук", NORM: "доктор сельскохозяйственных наук"},
|
||||||
|
{ORTH: "д-р социол. наук", NORM: "доктор социологических наук"},
|
||||||
|
{ORTH: "д-р техн. наук", NORM: "доктор технических наук"},
|
||||||
|
{ORTH: "д-р фармацевт. наук", NORM: "доктор фармацевтических наук"},
|
||||||
|
{ORTH: "д-р физ.-мат. наук", NORM: "доктор физико-математических наук"},
|
||||||
|
{ORTH: "д-р филол. наук", NORM: "доктор филологических наук"},
|
||||||
|
{ORTH: "д-р филос. наук", NORM: "доктор философских наук"},
|
||||||
|
{ORTH: "д-р хим. наук", NORM: "доктор химических наук"},
|
||||||
|
{ORTH: "д-р экон. наук", NORM: "доктор экономических наук"},
|
||||||
|
{ORTH: "д-р юрид. наук", NORM: "доктор юридических наук"},
|
||||||
|
{ORTH: "д-р", NORM: "доктор"},
|
||||||
|
{ORTH: "д.б.н.", NORM: "доктор биологических наук"},
|
||||||
|
{ORTH: "д.г.-м.н.", NORM: "доктор геолого-минералогических наук"},
|
||||||
|
{ORTH: "д.г.н.", NORM: "доктор географических наук"},
|
||||||
|
{ORTH: "д.и.н.", NORM: "доктор исторических наук"},
|
||||||
|
{ORTH: "д.иск.", NORM: "доктор искусствоведения"},
|
||||||
|
{ORTH: "д.м.н.", NORM: "доктор медицинских наук"},
|
||||||
|
{ORTH: "д.п.н.", NORM: "доктор психологических наук"},
|
||||||
|
{ORTH: "д.пед.н.", NORM: "доктор педагогических наук"},
|
||||||
|
{ORTH: "д.полит.н.", NORM: "доктор политических наук"},
|
||||||
|
{ORTH: "д.с.-х.н.", NORM: "доктор сельскохозяйственных наук"},
|
||||||
|
{ORTH: "д.социол.н.", NORM: "доктор социологических наук"},
|
||||||
|
{ORTH: "д.т.н.", NORM: "доктор технических наук"},
|
||||||
|
{ORTH: "д.т.н", NORM: "доктор технических наук"},
|
||||||
|
{ORTH: "д.ф.-м.н.", NORM: "доктор физико-математических наук"},
|
||||||
|
{ORTH: "д.ф.н.", NORM: "доктор филологических наук"},
|
||||||
|
{ORTH: "д.филос.н.", NORM: "доктор философских наук"},
|
||||||
|
{ORTH: "д.фил.н.", NORM: "доктор филологических наук"},
|
||||||
|
{ORTH: "д.х.н.", NORM: "доктор химических наук"},
|
||||||
|
{ORTH: "д.э.н.", NORM: "доктор экономических наук"},
|
||||||
|
{ORTH: "д.э.н", NORM: "доктор экономических наук"},
|
||||||
|
{ORTH: "д.ю.н.", NORM: "доктор юридических наук"},
|
||||||
|
{ORTH: "доц.", NORM: "доцент"},
|
||||||
|
{ORTH: "и.о.", NORM: "исполняющий обязанности"},
|
||||||
|
{ORTH: "к.б.н.", NORM: "кандидат биологических наук"},
|
||||||
|
{ORTH: "к.воен.н.", NORM: "кандидат военных наук"},
|
||||||
|
{ORTH: "к.г.-м.н.", NORM: "кандидат геолого-минералогических наук"},
|
||||||
|
{ORTH: "к.г.н.", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "к.геогр.н", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "к.геогр.наук", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "к.и.н.", NORM: "кандидат исторических наук"},
|
||||||
|
{ORTH: "к.иск.", NORM: "кандидат искусствоведения"},
|
||||||
|
{ORTH: "к.м.н.", NORM: "кандидат медицинских наук"},
|
||||||
|
{ORTH: "к.п.н.", NORM: "кандидат психологических наук"},
|
||||||
|
{ORTH: "к.псх.н.", NORM: "кандидат психологических наук"},
|
||||||
|
{ORTH: "к.пед.н.", NORM: "кандидат педагогических наук"},
|
||||||
|
{ORTH: "канд.пед.наук", NORM: "кандидат педагогических наук"},
|
||||||
|
{ORTH: "к.полит.н.", NORM: "кандидат политических наук"},
|
||||||
|
{ORTH: "к.с.-х.н.", NORM: "кандидат сельскохозяйственных наук"},
|
||||||
|
{ORTH: "к.социол.н.", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.с.н.", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.т.н.", NORM: "кандидат технических наук"},
|
||||||
|
{ORTH: "к.ф.-м.н.", NORM: "кандидат физико-математических наук"},
|
||||||
|
{ORTH: "к.ф.н.", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "к.фил.н.", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "к.филол.н", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "к.фарм.наук", NORM: "кандидат фармакологических наук"},
|
||||||
|
{ORTH: "к.фарм.н.", NORM: "кандидат фармакологических наук"},
|
||||||
|
{ORTH: "к.фарм.н", NORM: "кандидат фармакологических наук"},
|
||||||
|
{ORTH: "к.филос.наук", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "к.филос.н.", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "к.филос.н", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "к.х.н.", NORM: "кандидат химических наук"},
|
||||||
|
{ORTH: "к.х.н", NORM: "кандидат химических наук"},
|
||||||
|
{ORTH: "к.э.н.", NORM: "кандидат экономических наук"},
|
||||||
|
{ORTH: "к.э.н", NORM: "кандидат экономических наук"},
|
||||||
|
{ORTH: "к.ю.н.", NORM: "кандидат юридических наук"},
|
||||||
|
{ORTH: "к.ю.н", NORM: "кандидат юридических наук"},
|
||||||
|
{ORTH: "канд. архитектуры", NORM: "кандидат архитектуры"},
|
||||||
|
{ORTH: "канд. биол. наук", NORM: "кандидат биологических наук"},
|
||||||
|
{ORTH: "канд. ветеринар. наук", NORM: "кандидат ветеринарных наук"},
|
||||||
|
{ORTH: "канд. воен. наук", NORM: "кандидат военных наук"},
|
||||||
|
{ORTH: "канд. геогр. наук", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "канд. геол.-минерал. наук", NORM: "кандидат геолого-минералогических наук"},
|
||||||
|
{ORTH: "канд. искусствоведения", NORM: "кандидат искусствоведения"},
|
||||||
|
{ORTH: "канд. ист. наук", NORM: "кандидат исторических наук"},
|
||||||
|
{ORTH: "к.ист.н.", NORM: "кандидат исторических наук"},
|
||||||
|
{ORTH: "канд. культурологии", NORM: "кандидат культурологии"},
|
||||||
|
{ORTH: "канд. мед. наук", NORM: "кандидат медицинских наук"},
|
||||||
|
{ORTH: "канд. пед. наук", NORM: "кандидат педагогических наук"},
|
||||||
|
{ORTH: "канд. полит. наук", NORM: "кандидат политических наук"},
|
||||||
|
{ORTH: "канд. психол. наук", NORM: "кандидат психологических наук"},
|
||||||
|
{ORTH: "канд. с.-х. наук", NORM: "кандидат сельскохозяйственных наук"},
|
||||||
|
{ORTH: "канд. социол. наук", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.соц.наук", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.соц.н.", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.соц.н", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "канд. техн. наук", NORM: "кандидат технических наук"},
|
||||||
|
{ORTH: "канд. фармацевт. наук", NORM: "кандидат фармацевтических наук"},
|
||||||
|
{ORTH: "канд. физ.-мат. наук", NORM: "кандидат физико-математических наук"},
|
||||||
|
{ORTH: "канд. филол. наук", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "канд. филос. наук", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "канд. хим. наук", NORM: "кандидат химических наук"},
|
||||||
|
{ORTH: "канд. экон. наук", NORM: "кандидат экономических наук"},
|
||||||
|
{ORTH: "канд. юрид. наук", NORM: "кандидат юридических наук"},
|
||||||
|
{ORTH: "в.н.с.", NORM: "ведущий научный сотрудник"},
|
||||||
|
{ORTH: "мл. науч. сотр.", NORM: "младший научный сотрудник"},
|
||||||
|
{ORTH: "м.н.с.", NORM: "младший научный сотрудник"},
|
||||||
|
{ORTH: "проф.", NORM: "профессор"},
|
||||||
|
{ORTH: "профессор.кафедры", NORM: "профессор кафедры"},
|
||||||
|
{ORTH: "ст. науч. сотр.", NORM: "старший научный сотрудник"},
|
||||||
|
{ORTH: "чл.-к.", NORM: "член корреспондент"},
|
||||||
|
{ORTH: "чл.-корр.", NORM: "член-корреспондент"},
|
||||||
|
{ORTH: "чл.-кор.", NORM: "член-корреспондент"},
|
||||||
|
{ORTH: "дир.", NORM: "директор"},
|
||||||
|
{ORTH: "зам. дир.", NORM: "заместитель директора"},
|
||||||
|
{ORTH: "зав. каф.", NORM: "заведующий кафедрой"},
|
||||||
|
{ORTH: "зав.кафедрой", NORM: "заведующий кафедрой"},
|
||||||
|
{ORTH: "зав. кафедрой", NORM: "заведующий кафедрой"},
|
||||||
|
{ORTH: "асп.", NORM: "аспирант"},
|
||||||
|
{ORTH: "гл. науч. сотр.", NORM: "главный научный сотрудник"},
|
||||||
|
{ORTH: "вед. науч. сотр.", NORM: "ведущий научный сотрудник"},
|
||||||
|
{ORTH: "науч. сотр.", NORM: "научный сотрудник"},
|
||||||
|
{ORTH: "к.м.с.", NORM: "кандидат в мастера спорта"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Literary phrases abbreviations
|
||||||
|
{ORTH: "и т.д.", NORM: "и так далее"},
|
||||||
|
{ORTH: "и т.п.", NORM: "и тому подобное"},
|
||||||
|
{ORTH: "т.д.", NORM: "так далее"},
|
||||||
|
{ORTH: "т.п.", NORM: "тому подобное"},
|
||||||
|
{ORTH: "т.е.", NORM: "то есть"},
|
||||||
|
{ORTH: "т.к.", NORM: "так как"},
|
||||||
|
{ORTH: "в т.ч.", NORM: "в том числе"},
|
||||||
|
{ORTH: "и пр.", NORM: "и прочие"},
|
||||||
|
{ORTH: "и др.", NORM: "и другие"},
|
||||||
|
{ORTH: "т.н.", NORM: "так называемый"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Appeal to a person abbreviations
|
||||||
|
{ORTH: "г-н", NORM: "господин"},
|
||||||
|
{ORTH: "г-да", NORM: "господа"},
|
||||||
|
{ORTH: "г-жа", NORM: "госпожа"},
|
||||||
|
{ORTH: "тов.", NORM: "товарищ"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Time periods abbreviations
|
||||||
|
{ORTH: "до н.э.", NORM: "до нашей эры"},
|
||||||
|
{ORTH: "по н.в.", NORM: "по настоящее время"},
|
||||||
|
{ORTH: "в н.в.", NORM: "в настоящее время"},
|
||||||
|
{ORTH: "наст.", NORM: "настоящий"},
|
||||||
|
{ORTH: "наст. время", NORM: "настоящее время"},
|
||||||
|
{ORTH: "г.г.", NORM: "годы"},
|
||||||
|
{ORTH: "гг.", NORM: "годы"},
|
||||||
|
{ORTH: "т.г.", NORM: "текущий год"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Address forming elements abbreviations
|
||||||
|
{ORTH: "респ.", NORM: "республика"},
|
||||||
|
{ORTH: "обл.", NORM: "область"},
|
||||||
|
{ORTH: "г.ф.з.", NORM: "город федерального значения"},
|
||||||
|
{ORTH: "а.обл.", NORM: "автономная область"},
|
||||||
|
{ORTH: "а.окр.", NORM: "автономный округ"},
|
||||||
|
{ORTH: "м.р-н", NORM: "муниципальный район"},
|
||||||
|
{ORTH: "г.о.", NORM: "городской округ"},
|
||||||
|
{ORTH: "г.п.", NORM: "городское поселение"},
|
||||||
|
{ORTH: "с.п.", NORM: "сельское поселение"},
|
||||||
|
{ORTH: "вн.р-н", NORM: "внутригородской район"},
|
||||||
|
{ORTH: "вн.тер.г.", NORM: "внутригородская территория города"},
|
||||||
|
{ORTH: "пос.", NORM: "поселение"},
|
||||||
|
{ORTH: "р-н", NORM: "район"},
|
||||||
|
{ORTH: "с/с", NORM: "сельсовет"},
|
||||||
|
{ORTH: "г.", NORM: "город"},
|
||||||
|
{ORTH: "п.г.т.", NORM: "поселок городского типа"},
|
||||||
|
{ORTH: "пгт.", NORM: "поселок городского типа"},
|
||||||
|
{ORTH: "р.п.", NORM: "рабочий поселок"},
|
||||||
|
{ORTH: "рп.", NORM: "рабочий поселок"},
|
||||||
|
{ORTH: "кп.", NORM: "курортный поселок"},
|
||||||
|
{ORTH: "гп.", NORM: "городской поселок"},
|
||||||
|
{ORTH: "п.", NORM: "поселок"},
|
||||||
|
{ORTH: "в-ки", NORM: "выселки"},
|
||||||
|
{ORTH: "г-к", NORM: "городок"},
|
||||||
|
{ORTH: "з-ка", NORM: "заимка"},
|
||||||
|
{ORTH: "п-к", NORM: "починок"},
|
||||||
|
{ORTH: "киш.", NORM: "кишлак"},
|
||||||
|
{ORTH: "п. ст. ", NORM: "поселок станция"},
|
||||||
|
{ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"},
|
||||||
|
{ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"},
|
||||||
|
{ORTH: "ж/д б-ка", NORM: "железнодорожная будка"},
|
||||||
|
{ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"},
|
||||||
|
{ORTH: "ж/д к-ма", NORM: "железнодорожная казарма"},
|
||||||
|
{ORTH: "ж/д к-т", NORM: "железнодорожный комбинат"},
|
||||||
|
{ORTH: "ж/д пл-ма", NORM: "железнодорожная платформа"},
|
||||||
|
{ORTH: "ж/д пл-ка", NORM: "железнодорожная площадка"},
|
||||||
|
{ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"},
|
||||||
|
{ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"},
|
||||||
|
{ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"},
|
||||||
|
{ORTH: "ж/д ст. ", NORM: "железнодорожная станция"},
|
||||||
|
{ORTH: "м-ко", NORM: "местечко"},
|
||||||
|
{ORTH: "д.", NORM: "деревня"},
|
||||||
|
{ORTH: "с.", NORM: "село"},
|
||||||
|
{ORTH: "сл.", NORM: "слобода"},
|
||||||
|
{ORTH: "ст. ", NORM: "станция"},
|
||||||
|
{ORTH: "ст-ца", NORM: "станица"},
|
||||||
|
{ORTH: "у.", NORM: "улус"},
|
||||||
|
{ORTH: "х.", NORM: "хутор"},
|
||||||
|
{ORTH: "рзд.", NORM: "разъезд"},
|
||||||
|
{ORTH: "зим.", NORM: "зимовье"},
|
||||||
|
{ORTH: "б-г", NORM: "берег"},
|
||||||
|
{ORTH: "ж/р", NORM: "жилой район"},
|
||||||
|
{ORTH: "кв-л", NORM: "квартал"},
|
||||||
|
{ORTH: "мкр.", NORM: "микрорайон"},
|
||||||
|
{ORTH: "ост-в", NORM: "остров"},
|
||||||
|
{ORTH: "платф.", NORM: "платформа"},
|
||||||
|
{ORTH: "п/р", NORM: "промышленный район"},
|
||||||
|
{ORTH: "р-н", NORM: "район"},
|
||||||
|
{ORTH: "тер.", NORM: "территория"},
|
||||||
|
{
|
||||||
|
ORTH: "тер. СНО",
|
||||||
|
NORM: "территория садоводческих некоммерческих объединений граждан",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ORTH: "тер. ОНО",
|
||||||
|
NORM: "территория огороднических некоммерческих объединений граждан",
|
||||||
|
},
|
||||||
|
{ORTH: "тер. ДНО", NORM: "территория дачных некоммерческих объединений граждан"},
|
||||||
|
{ORTH: "тер. СНТ", NORM: "территория садоводческих некоммерческих товариществ"},
|
||||||
|
{ORTH: "тер. ОНТ", NORM: "территория огороднических некоммерческих товариществ"},
|
||||||
|
{ORTH: "тер. ДНТ", NORM: "территория дачных некоммерческих товариществ"},
|
||||||
|
{ORTH: "тер. СПК", NORM: "территория садоводческих потребительских кооперативов"},
|
||||||
|
{ORTH: "тер. ОПК", NORM: "территория огороднических потребительских кооперативов"},
|
||||||
|
{ORTH: "тер. ДПК", NORM: "территория дачных потребительских кооперативов"},
|
||||||
|
{ORTH: "тер. СНП", NORM: "территория садоводческих некоммерческих партнерств"},
|
||||||
|
{ORTH: "тер. ОНП", NORM: "территория огороднических некоммерческих партнерств"},
|
||||||
|
{ORTH: "тер. ДНП", NORM: "территория дачных некоммерческих партнерств"},
|
||||||
|
{ORTH: "тер. ТСН", NORM: "территория товарищества собственников недвижимости"},
|
||||||
|
{ORTH: "тер. ГСК", NORM: "территория гаражно-строительного кооператива"},
|
||||||
|
{ORTH: "ус.", NORM: "усадьба"},
|
||||||
|
{ORTH: "тер.ф.х.", NORM: "территория фермерского хозяйства"},
|
||||||
|
{ORTH: "ю.", NORM: "юрты"},
|
||||||
|
{ORTH: "ал.", NORM: "аллея"},
|
||||||
|
{ORTH: "б-р", NORM: "бульвар"},
|
||||||
|
{ORTH: "взв.", NORM: "взвоз"},
|
||||||
|
{ORTH: "взд.", NORM: "въезд"},
|
||||||
|
{ORTH: "дор.", NORM: "дорога"},
|
||||||
|
{ORTH: "ззд.", NORM: "заезд"},
|
||||||
|
{ORTH: "км", NORM: "километр"},
|
||||||
|
{ORTH: "к-цо", NORM: "кольцо"},
|
||||||
|
{ORTH: "лн.", NORM: "линия"},
|
||||||
|
{ORTH: "мгстр.", NORM: "магистраль"},
|
||||||
|
{ORTH: "наб.", NORM: "набережная"},
|
||||||
|
{ORTH: "пер-д", NORM: "переезд"},
|
||||||
|
{ORTH: "пер.", NORM: "переулок"},
|
||||||
|
{ORTH: "пл-ка", NORM: "площадка"},
|
||||||
|
{ORTH: "пл.", NORM: "площадь"},
|
||||||
|
{ORTH: "пр-д", NORM: "проезд"},
|
||||||
|
{ORTH: "пр-к", NORM: "просек"},
|
||||||
|
{ORTH: "пр-ка", NORM: "просека"},
|
||||||
|
{ORTH: "пр-лок", NORM: "проселок"},
|
||||||
|
{ORTH: "пр-кт", NORM: "проспект"},
|
||||||
|
{ORTH: "проул.", NORM: "проулок"},
|
||||||
|
{ORTH: "рзд.", NORM: "разъезд"},
|
||||||
|
{ORTH: "ряд", NORM: "ряд(ы)"},
|
||||||
|
{ORTH: "с-р", NORM: "сквер"},
|
||||||
|
{ORTH: "с-к", NORM: "спуск"},
|
||||||
|
{ORTH: "сзд.", NORM: "съезд"},
|
||||||
|
{ORTH: "туп.", NORM: "тупик"},
|
||||||
|
{ORTH: "ул.", NORM: "улица"},
|
||||||
|
{ORTH: "ш.", NORM: "шоссе"},
|
||||||
|
{ORTH: "влд.", NORM: "владение"},
|
||||||
|
{ORTH: "г-ж", NORM: "гараж"},
|
||||||
|
{ORTH: "д.", NORM: "дом"},
|
||||||
|
{ORTH: "двлд.", NORM: "домовладение"},
|
||||||
|
{ORTH: "зд.", NORM: "здание"},
|
||||||
|
{ORTH: "з/у", NORM: "земельный участок"},
|
||||||
|
{ORTH: "кв.", NORM: "квартира"},
|
||||||
|
{ORTH: "ком.", NORM: "комната"},
|
||||||
|
{ORTH: "подв.", NORM: "подвал"},
|
||||||
|
{ORTH: "кот.", NORM: "котельная"},
|
||||||
|
{ORTH: "п-б", NORM: "погреб"},
|
||||||
|
{ORTH: "к.", NORM: "корпус"},
|
||||||
|
{ORTH: "ОНС", NORM: "объект незавершенного строительства"},
|
||||||
|
{ORTH: "оф.", NORM: "офис"},
|
||||||
|
{ORTH: "пав.", NORM: "павильон"},
|
||||||
|
{ORTH: "помещ.", NORM: "помещение"},
|
||||||
|
{ORTH: "раб.уч.", NORM: "рабочий участок"},
|
||||||
|
{ORTH: "скл.", NORM: "склад"},
|
||||||
|
{ORTH: "coop.", NORM: "сооружение"},
|
||||||
|
{ORTH: "стр.", NORM: "строение"},
|
||||||
|
{ORTH: "торг.зал", NORM: "торговый зал"},
|
||||||
|
{ORTH: "а/п", NORM: "аэропорт"},
|
||||||
|
{ORTH: "им.", NORM: "имени"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Others abbreviations
|
||||||
|
{ORTH: "тыс.руб.", NORM: "тысяч рублей"},
|
||||||
|
{ORTH: "тыс.", NORM: "тысяч"},
|
||||||
|
{ORTH: "руб.", NORM: "рубль"},
|
||||||
|
{ORTH: "долл.", NORM: "доллар"},
|
||||||
|
{ORTH: "прим.", NORM: "примечание"},
|
||||||
|
{ORTH: "прим.ред.", NORM: "примечание редакции"},
|
||||||
|
{ORTH: "см. также", NORM: "смотри также"},
|
||||||
|
{ORTH: "кв.м.", NORM: "квадрантный метр"},
|
||||||
|
{ORTH: "м2", NORM: "квадрантный метр"},
|
||||||
|
{ORTH: "б/у", NORM: "бывший в употреблении"},
|
||||||
|
{ORTH: "сокр.", NORM: "сокращение"},
|
||||||
|
{ORTH: "чел.", NORM: "человек"},
|
||||||
|
{ORTH: "б.п.", NORM: "базисный пункт"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||||
|
|
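The Russian exception entries above all have the same shape: an ORTH string that must survive as a single token plus a NORM carrying its expansion, registered via `_exc[abbr[ORTH]] = [abbr]` and merged into TOKENIZER_EXCEPTIONS. A small, hedged check of the intended effect; the sample text is an assumption and the exact NORM values depend on the entries shipped in the installed spaCy version.

import spacy

# A blank Russian pipeline is enough: tokenizer exceptions are part of the
# language defaults, no trained model required.
nlp = spacy.blank("ru")
doc = nlp("г-н Иванов вернулся в 2к20 году")
for token in doc:
    print(repr(token.text), repr(token.norm_))
# Expected (roughly): "г-н" stays one token with norm "господин", "2к20" gets
# the norm "2020", and the remaining tokens keep their own text as norm.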

18  spacy/lang/sl/examples.py  Normal file
@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev",
    "France Prešeren je umrl 8. februarja 1849 v Kranju",
    "Staro ljubljansko letališče Moste bo obnovila družba BTC",
    "London je največje mesto v Združenem kraljestvu.",
    "Kje se skrivaš?",
    "Kdo je predsednik Francije?",
    "Katero je glavno mesto Združenih držav Amerike?",
    "Kdaj je bil rojen Milan Kučan?",
]
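A quick way to exercise the new examples without assuming a trained Slovenian model is to run them through a blank pipeline, which only tokenizes; this assumes an installed spaCy version that already ships this file.

import spacy
from spacy.lang.sl.examples import sentences

nlp = spacy.blank("sl")
for doc in nlp.pipe(sentences):
    print(len(doc), doc[:4].text)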

spacy/lang/sl/stop_words.py
@ -1,13 +1,10 @@
 # Source: https://github.com/stopwords-iso/stopwords-sl
-# TODO: probably needs to be tidied up – the list seems to have month names in
-# it, which shouldn't be considered stop words.
+# Removed various words that are not normally considered stop words, such as months.

 STOP_WORDS = set(
 """
 a
 ali
-april
-avgust
 b
 bi
 bil
@ -19,7 +16,6 @@ biti
 blizu
 bo
 bodo
-bojo
 bolj
 bom
 bomo
@ -37,16 +33,6 @@ da
 daleč
 dan
 danes
-datum
-december
-deset
-deseta
-deseti
-deseto
-devet
-deveta
-deveti
-deveto
 do
 dober
 dobra
@ -54,16 +40,7 @@ dobri
 dobro
 dokler
 dol
-dolg
-dolga
-dolgi
 dovolj
-drug
-druga
-drugi
-drugo
-dva
-dve
 e
 eden
 en
@ -74,7 +51,6 @@ enkrat
 eno
 etc.
 f
-februar
 g
 g.
 ga
@ -93,16 +69,12 @@ iv
 ix
 iz
 j
-januar
 jaz
 je
 ji
 jih
 jim
 jo
-julij
-junij
-jutri
 k
 kadarkoli
 kaj
@ -123,41 +95,23 @@ kje
 kjer
 kjerkoli
 ko
-koder
 koderkoli
 koga
 komu
 kot
-kratek
-kratka
-kratke
-kratki
 l
-lahka
-lahke
-lahki
-lahko
 le
 lep
 lepa
 lepe
 lepi
 lepo
-leto
 m
-maj
-majhen
-majhna
-majhni
-malce
-malo
 manj
-marec
 me
 med
 medtem
 mene
-mesec
 mi
 midva
 midve
@ -183,7 +137,6 @@ najmanj
 naju
 največ
 nam
-narobe
 nas
 nato
 nazaj
@ -192,7 +145,6 @@ naša
 naše
 ne
 nedavno
-nedelja
 nek
 neka
 nekaj
@ -236,7 +188,6 @@ njuna
 njuno
 no
 nocoj
-november
 npr.
 o
 ob
@ -244,51 +195,23 @@ oba
 obe
 oboje
 od
-odprt
-odprta
-odprti
 okoli
-oktober
 on
 onadva
 one
 oni
 onidve
-osem
-osma
-osmi
-osmo
 oz.
 p
 pa
-pet
-peta
-petek
-peti
-peto
 po
 pod
 pogosto
 poleg
-poln
-polna
-polni
-polno
 ponavadi
-ponedeljek
 ponovno
 potem
 povsod
-pozdravljen
-pozdravljeni
-prav
-prava
-prave
-pravi
-pravo
-prazen
-prazna
-prazno
 prbl.
 precej
 pred
@ -297,19 +220,10 @@ preko
 pri
 pribl.
 približno
-primer
-pripravljen
-pripravljena
-pripravljeni
 proti
-prva
-prvi
-prvo
 r
-ravno
 redko
 res
-reč
 s
 saj
 sam
@ -321,29 +235,17 @@ se
 sebe
 sebi
 sedaj
-sedem
-sedma
-sedmi
-sedmo
 sem
-september
 seveda
 si
 sicer
 skoraj
 skozi
-slab
 smo
 so
-sobota
 spet
-sreda
-srednja
-srednji
 sta
 ste
-stran
-stvar
 sva
 t
 ta
@ -358,10 +260,6 @@ te
 tebe
 tebi
 tega
-težak
-težka
-težki
-težko
 ti
 tista
 tiste
@ -371,11 +269,6 @@ tj.
 tja
 to
 toda
-torek
-tretja
-tretje
-tretji
-tri
 tu
 tudi
 tukaj
@ -392,10 +285,6 @@ vaša
 vaše
 ve
 vedno
-velik
-velika
-veliki
-veliko
 vendar
 ves
 več
@ -403,10 +292,6 @@ vi
 vidva
 vii
 viii
-visok
-visoka
-visoke
-visoki
 vsa
 vsaj
 vsak
@ -420,34 +305,21 @@ vsega
 vsi
 vso
 včasih
-včeraj
 x
 z
 za
 zadaj
 zadnji
 zakaj
-zaprta
-zaprti
-zaprto
 zdaj
 zelo
 zunaj
 č
 če
 često
-četrta
-četrtek
-četrti
-četrto
 čez
 čigav
 š
-šest
-šesta
-šesti
-šesto
-štiri
 ž
 že
 """.split()
@ -53,7 +53,7 @@ _ordinal_words = [
|
||||||
"doksanıncı",
|
"doksanıncı",
|
||||||
"yüzüncü",
|
"yüzüncü",
|
||||||
"bininci",
|
"bininci",
|
||||||
"mliyonuncu",
|
"milyonuncu",
|
||||||
"milyarıncı",
|
"milyarıncı",
|
||||||
"trilyonuncu",
|
"trilyonuncu",
|
||||||
"katrilyonuncu",
|
"katrilyonuncu",
|
||||||
|
|
|
@@ -6,19 +6,30 @@ from ...util import update_exc
 _exc = {}

 for exc_data in [
+    {ORTH: "обл.", NORM: "область"},
+    {ORTH: "р-н.", NORM: "район"},
+    {ORTH: "р-н", NORM: "район"},
+    {ORTH: "м.", NORM: "місто"},
     {ORTH: "вул.", NORM: "вулиця"},
-    {ORTH: "ім.", NORM: "імені"},
     {ORTH: "просп.", NORM: "проспект"},
+    {ORTH: "пр-кт", NORM: "проспект"},
     {ORTH: "бул.", NORM: "бульвар"},
     {ORTH: "пров.", NORM: "провулок"},
     {ORTH: "пл.", NORM: "площа"},
+    {ORTH: "майд.", NORM: "майдан"},
+    {ORTH: "мкр.", NORM: "мікрорайон"},
+    {ORTH: "ст.", NORM: "станція"},
+    {ORTH: "ж/м", NORM: "житловий масив"},
+    {ORTH: "наб.", NORM: "набережна"},
+    {ORTH: "в/ч", NORM: "військова частина"},
+    {ORTH: "в/м", NORM: "військове містечко"},
+    {ORTH: "оз.", NORM: "озеро"},
+    {ORTH: "ім.", NORM: "імені"},
     {ORTH: "г.", NORM: "гора"},
     {ORTH: "п.", NORM: "пан"},
-    {ORTH: "м.", NORM: "місто"},
     {ORTH: "проф.", NORM: "професор"},
     {ORTH: "акад.", NORM: "академік"},
     {ORTH: "доц.", NORM: "доцент"},
-    {ORTH: "оз.", NORM: "озеро"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

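For orientation, each entry above keeps the ORTH string as a single token and sets its norm to the NORM value. A minimal, hedged sketch of the intended effect (assuming a blank Ukrainian pipeline can be created and these exceptions are part of its tokenizer data):

# Sketch only, not part of the diff: how an ORTH/NORM tokenizer exception behaves.
import spacy

nlp = spacy.blank("uk")
doc = nlp("просп. Перемоги, 12")
# "просп." is expected to stay one token with norm "проспект"
print([(token.text, token.norm_) for token in doc])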
@@ -131,7 +131,7 @@ class Language:
         self,
         vocab: Union[Vocab, bool] = True,
         *,
-        max_length: int = 10 ** 6,
+        max_length: int = 10**6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
         batch_size: int = 1000,
@@ -354,12 +354,15 @@ class Language:
     @property
     def pipe_labels(self) -> Dict[str, List[str]]:
         """Get the labels set by the pipeline components, if available (if
-        the component exposes a labels property).
+        the component exposes a labels property and the labels are not
+        hidden).

         RETURNS (Dict[str, List[str]]): Labels keyed by component name.
         """
         labels = {}
         for name, pipe in self._components:
+            if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
+                continue
             if hasattr(pipe, "labels"):
                 labels[name] = list(pipe.labels)
         return SimpleFrozenDict(labels)
@@ -1219,8 +1222,9 @@ class Language:
             component_cfg = {}
         grads = {}

-        def get_grads(W, dW, key=None):
+        def get_grads(key, W, dW):
             grads[key] = (W, dW)
+            return W, dW

         get_grads.learn_rate = sgd.learn_rate  # type: ignore[attr-defined, union-attr]
         get_grads.b1 = sgd.b1  # type: ignore[attr-defined, union-attr]
@@ -1233,7 +1237,7 @@ class Language:
                     examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
                 )
             for key, (W, dW) in grads.items():
-                sgd(W, dW, key=key)  # type: ignore[call-arg, misc]
+                sgd(key, W, dW)  # type: ignore[call-arg, misc]
         return losses

     def begin_training(
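The new hide_labels check means a pipeline component can opt out of nlp.pipe_labels. A small illustrative sketch with a made-up component (the factory name and labels are hypothetical, only the hide_labels flag matters):

# Sketch: a component whose labels are skipped by Language.pipe_labels.
import spacy
from spacy.language import Language


class ToyLabeller:
    hide_labels = True        # picked up by the check added above
    labels = ("A", "B")       # would otherwise be listed

    def __call__(self, doc):
        return doc


@Language.factory("toy_labeller")
def create_toy_labeller(nlp, name):
    return ToyLabeller()


nlp = spacy.blank("en")
nlp.add_pipe("toy_labeller")
print(nlp.pipe_labels)  # expected: no "toy_labeller" entry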
@@ -244,8 +244,12 @@ cdef class Matcher:
                 pipe = "parser"
             error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
             raise ValueError(error_msg)
-        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                               extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+        if self.patterns.empty():
+            matches = []
+        else:
+            matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
+                                   extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
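A quick sketch of the behaviour this guard is meant to give (assuming a spaCy build that includes the change): calling a Matcher with no patterns added simply yields no matches instead of dereferencing an empty pattern array.

# Sketch: an empty Matcher should return an empty list of matches.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
doc = nlp("nothing to match here")
print(matcher(doc))  # expected: []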
@@ -14,7 +14,7 @@ class PhraseMatcher:
     def add(
         self,
         key: str,
-        docs: List[List[Dict[str, Any]]],
+        docs: List[Doc],
         *,
         on_match: Optional[
             Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
@@ -63,4 +63,4 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:


 def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
-    return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths))
+    return Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths)
@@ -1,34 +1,82 @@
 from pathlib import Path
-from typing import Optional, Callable, Iterable, List
+from typing import Optional, Callable, Iterable, List, Tuple
 from thinc.types import Floats2d
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear
+from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged

 from ...util import registry
 from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
 from ...tokens import Span, Doc
+from ..extract_spans import extract_spans
+from ...errors import Errors


-@registry.architectures("spacy.EntityLinker.v1")
+@registry.architectures("spacy.EntityLinker.v2")
 def build_nel_encoder(
     tok2vec: Model, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
-    with Model.define_operators({">>": chain, "**": clone}):
+    with Model.define_operators({">>": chain, "&": tuplify}):
         token_width = tok2vec.maybe_get_dim("nO")
         output_layer = Linear(nO=nO, nI=token_width)
         model = (
-            tok2vec
-            >> list2ragged()
+            ((tok2vec >> list2ragged()) & build_span_maker())
+            >> extract_spans()
             >> reduce_mean()
             >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0))  # type: ignore[arg-type]
             >> output_layer
         )
         model.set_ref("output_layer", output_layer)
         model.set_ref("tok2vec", tok2vec)
+        # flag to show this isn't legacy
+        model.attrs["include_span_maker"] = True
         return model


+def build_span_maker(n_sents: int = 0) -> Model:
+    model: Model = Model("span_maker", forward=span_maker_forward)
+    model.attrs["n_sents"] = n_sents
+    return model
+
+
+def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
+    ops = model.ops
+    n_sents = model.attrs["n_sents"]
+    candidates = []
+    for doc in docs:
+        cands = []
+        try:
+            sentences = [s for s in doc.sents]
+        except ValueError:
+            # no sentence info, normal in initialization
+            for tok in doc:
+                tok.is_sent_start = tok.i == 0
+            sentences = [doc[:]]
+        for ent in doc.ents:
+            try:
+                # find the sentence in the list of sentences.
+                sent_index = sentences.index(ent.sent)
+            except AttributeError:
+                # Catch the exception when ent.sent is None and provide a user-friendly warning
+                raise RuntimeError(Errors.E030) from None
+            # get n previous sentences, if there are any
+            start_sentence = max(0, sent_index - n_sents)
+            # get n posterior sentences, or as many < n as there are
+            end_sentence = min(len(sentences) - 1, sent_index + n_sents)
+            # get token positions
+            start_token = sentences[start_sentence].start
+            end_token = sentences[end_sentence].end
+            # save positions for extraction
+            cands.append((start_token, end_token))
+
+        candidates.append(ops.asarray2i(cands))
+    candlens = ops.asarray1i([len(cands) for cands in candidates])
+    candidates = ops.xp.concatenate(candidates)
+    outputs = Ragged(candidates, candlens)
+    # because this is just rearranging docs, the backprop does nothing
+    return outputs, lambda x: []
+
+
 @registry.misc("spacy.KBFromFile.v1")
 def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
     def kb_from_file(vocab):
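The windowing in span_maker_forward boils down to: for every entity, take the token span covering its sentence plus n_sents sentences of context on either side. A dependency-free sketch of just that computation, using plain (start, end) tuples in place of spaCy spans:

# Pure-Python sketch of the context-window logic in span_maker_forward.
from typing import List, Tuple


def entity_window(
    sentences: List[Tuple[int, int]], ent_sent_index: int, n_sents: int = 0
) -> Tuple[int, int]:
    # clamp the window of sentences around the entity's sentence
    start_sentence = max(0, ent_sent_index - n_sents)
    end_sentence = min(len(sentences) - 1, ent_sent_index + n_sents)
    start_token = sentences[start_sentence][0]
    end_token = sentences[end_sentence][1]
    return start_token, end_token


sents = [(0, 5), (5, 12), (12, 20)]
print(entity_window(sents, ent_sent_index=1, n_sents=0))  # (5, 12)
print(entity_window(sents, ent_sent_index=1, n_sents=1))  # (0, 20)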
@@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
     target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
     target = target.reshape((-1, 256 * nr_char))
     diff = prediction - target
-    loss = (diff ** 2).sum()
+    loss = (diff**2).sum()
     d_target = diff / float(prediction.shape[0])
     return loss, d_target

@@ -1,14 +1,14 @@
 from typing import Optional, List
-from thinc.api import zero_init, with_array, Softmax, chain, Model
+from thinc.api import zero_init, with_array, Softmax_v2, chain, Model
 from thinc.types import Floats2d

 from ...util import registry
 from ...tokens import Doc


-@registry.architectures("spacy.Tagger.v1")
+@registry.architectures("spacy.Tagger.v2")
 def build_tagger_model(
-    tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
+    tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False
 ) -> Model[List[Doc], List[Floats2d]]:
     """Build a tagger model, using a provided token-to-vector component. The tagger
     model simply adds a linear layer with softmax activation to predict scores
@@ -19,7 +19,9 @@ def build_tagger_model(
     """
     # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    output_layer = Softmax(nO, t2v_width, init_W=zero_init)
+    output_layer = Softmax_v2(
+        nO, t2v_width, init_W=zero_init, normalize_outputs=normalize
+    )
     softmax = with_array(output_layer)  # type: ignore
     model = chain(tok2vec, softmax)
     model.set_ref("tok2vec", tok2vec)
@@ -1,6 +1,7 @@
 from .attributeruler import AttributeRuler
 from .coref import CoreferenceResolver
 from .dep_parser import DependencyParser
+from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
 from .entityruler import EntityRuler
||||||
|
|
0 spacy/pipeline/_edit_tree_internals/__init__.py (new file)
93 spacy/pipeline/_edit_tree_internals/edit_trees.pxd (new file)
|
@@ -0,0 +1,93 @@
from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector

from ...typedefs cimport attr_t, hash_t, len_t
from ...strings cimport StringStore

cdef extern from "<algorithm>" namespace "std" nogil:
    void swap[T](T& a, T& b) except +  # Only available in Cython 3.

# An edit tree (Müller et al., 2015) is a tree structure that consists of
# edit operations. The two types of operations are string matches
# and string substitutions. Given an input string s and an output string t,
# substitution and match nodes should be interpreted as follows:
#
# * Substitution node: consists of an original string and substitute string.
#   If s matches the original string, then t is the substitute. Otherwise,
#   the node does not apply.
# * Match node: consists of a prefix length, suffix length, prefix edit tree,
#   and suffix edit tree. If s is composed of a prefix, middle part, and suffix
#   with the given suffix and prefix lengths, then t is the concatenation
#   prefix_tree(prefix) + middle + suffix_tree(suffix).
#
# For efficiency, we represent strings in substitution nodes as integers, with
# the actual strings stored in a StringStore. Subtrees in match nodes are stored
# as tree identifiers (rather than pointers) to simplify serialization.

cdef uint32_t NULL_TREE_ID

cdef struct MatchNodeC:
    len_t prefix_len
    len_t suffix_len
    uint32_t prefix_tree
    uint32_t suffix_tree

cdef struct SubstNodeC:
    attr_t orig
    attr_t subst

cdef union NodeC:
    MatchNodeC match_node
    SubstNodeC subst_node

cdef struct EditTreeC:
    bint is_match_node
    NodeC inner

cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
        uint32_t prefix_tree, uint32_t suffix_tree):
    cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
            suffix_len=suffix_len, prefix_tree=prefix_tree,
            suffix_tree=suffix_tree)
    cdef NodeC inner = NodeC(match_node=match_node)
    return EditTreeC(is_match_node=True, inner=inner)

cdef inline EditTreeC edittree_new_subst(attr_t orig, attr_t subst):
    cdef EditTreeC node
    cdef SubstNodeC subst_node = SubstNodeC(orig=orig, subst=subst)
    cdef NodeC inner = NodeC(subst_node=subst_node)
    return EditTreeC(is_match_node=False, inner=inner)

cdef inline uint64_t edittree_hash(EditTreeC tree):
    cdef MatchNodeC match_node
    cdef SubstNodeC subst_node

    if tree.is_match_node:
        match_node = tree.inner.match_node
        return hash((match_node.prefix_len, match_node.suffix_len, match_node.prefix_tree, match_node.suffix_tree))
    else:
        subst_node = tree.inner.subst_node
        return hash((subst_node.orig, subst_node.subst))

cdef struct LCS:
    int source_begin
    int source_end
    int target_begin
    int target_end

cdef inline bint lcs_is_empty(LCS lcs):
    return lcs.source_begin == 0 and lcs.source_end == 0 and lcs.target_begin == 0 and lcs.target_end == 0

cdef class EditTrees:
    cdef vector[EditTreeC] trees
    cdef unordered_map[hash_t, uint32_t] map
    cdef StringStore strings

    cpdef uint32_t add(self, str form, str lemma)
    cpdef str apply(self, uint32_t tree_id, str form)
    cpdef unicode tree_to_str(self, uint32_t tree_id)

    cdef uint32_t _add(self, str form, str lemma)
    cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces)
    cdef uint32_t _tree_id(self, EditTreeC tree)
305 spacy/pipeline/_edit_tree_internals/edit_trees.pyx (new file)
|
@ -0,0 +1,305 @@
|
||||||
|
# cython: infer_types=True, binding=True
|
||||||
|
from cython.operator cimport dereference as deref
|
||||||
|
from libc.stdint cimport uint32_t
|
||||||
|
from libc.stdint cimport UINT32_MAX
|
||||||
|
from libc.string cimport memset
|
||||||
|
from libcpp.pair cimport pair
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from ...typedefs cimport hash_t
|
||||||
|
|
||||||
|
from ... import util
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...strings import StringStore
|
||||||
|
from .schemas import validate_edit_tree
|
||||||
|
|
||||||
|
|
||||||
|
NULL_TREE_ID = UINT32_MAX
|
||||||
|
|
||||||
|
cdef LCS find_lcs(str source, str target):
|
||||||
|
"""
|
||||||
|
Find the longest common subsequence (LCS) between two strings. If there are
|
||||||
|
multiple LCSes, only one of them is returned.
|
||||||
|
|
||||||
|
source (str): The first string.
|
||||||
|
target (str): The second string.
|
||||||
|
RETURNS (LCS): The spans of the longest common subsequences.
|
||||||
|
"""
|
||||||
|
cdef Py_ssize_t source_len = len(source)
|
||||||
|
cdef Py_ssize_t target_len = len(target)
|
||||||
|
cdef size_t longest_align = 0;
|
||||||
|
cdef int source_idx, target_idx
|
||||||
|
cdef LCS lcs
|
||||||
|
cdef Py_UCS4 source_cp, target_cp
|
||||||
|
|
||||||
|
memset(&lcs, 0, sizeof(lcs))
|
||||||
|
|
||||||
|
cdef vector[size_t] prev_aligns = vector[size_t](target_len);
|
||||||
|
cdef vector[size_t] cur_aligns = vector[size_t](target_len);
|
||||||
|
|
||||||
|
for (source_idx, source_cp) in enumerate(source):
|
||||||
|
for (target_idx, target_cp) in enumerate(target):
|
||||||
|
if source_cp == target_cp:
|
||||||
|
if source_idx == 0 or target_idx == 0:
|
||||||
|
cur_aligns[target_idx] = 1
|
||||||
|
else:
|
||||||
|
cur_aligns[target_idx] = prev_aligns[target_idx - 1] + 1
|
||||||
|
|
||||||
|
# Check if this is the longest alignment and replace previous
|
||||||
|
# best alignment when this is the case.
|
||||||
|
if cur_aligns[target_idx] > longest_align:
|
||||||
|
longest_align = cur_aligns[target_idx]
|
||||||
|
lcs.source_begin = source_idx - longest_align + 1
|
||||||
|
lcs.source_end = source_idx + 1
|
||||||
|
lcs.target_begin = target_idx - longest_align + 1
|
||||||
|
lcs.target_end = target_idx + 1
|
||||||
|
else:
|
||||||
|
# No match, we start with a zero-length alignment.
|
||||||
|
cur_aligns[target_idx] = 0
|
||||||
|
swap(prev_aligns, cur_aligns)
|
||||||
|
|
||||||
|
return lcs
|
||||||
|
|
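find_lcs above is a rolling-row dynamic program; the recurrence it uses is that of the longest common contiguous piece of the two strings. A hedged pure-Python sketch of the same idea:

# Sketch of the rolling-row DP in find_lcs: track, for each target position,
# the length of the common run ending there, and remember the best span seen.
def find_lcs_spans(source: str, target: str):
    """Return (source_begin, source_end, target_begin, target_end) of one
    longest common contiguous piece, or (0, 0, 0, 0) if there is none."""
    best = (0, 0, 0, 0)
    longest = 0
    prev = [0] * len(target)
    for i, s_cp in enumerate(source):
        cur = [0] * len(target)
        for j, t_cp in enumerate(target):
            if s_cp == t_cp:
                cur[j] = 1 if i == 0 or j == 0 else prev[j - 1] + 1
                if cur[j] > longest:
                    longest = cur[j]
                    best = (i - longest + 1, i + 1, j - longest + 1, j + 1)
        prev = cur
    return best


print(find_lcs_spans("gegooid", "gooien"))  # (2, 6, 0, 4): the shared piece "gooi"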
||||||
|
cdef class EditTrees:
|
||||||
|
"""Container for constructing and storing edit trees."""
|
||||||
|
def __init__(self, strings: StringStore):
|
||||||
|
"""Create a container for edit trees.
|
||||||
|
|
||||||
|
strings (StringStore): the string store to use."""
|
||||||
|
self.strings = strings
|
||||||
|
|
||||||
|
cpdef uint32_t add(self, str form, str lemma):
|
||||||
|
"""Add an edit tree that rewrites the given string into the given lemma.
|
||||||
|
|
||||||
|
RETURNS (int): identifier of the edit tree in the container.
|
||||||
|
"""
|
||||||
|
# Treat two empty strings as a special case. Generating an edit
|
||||||
|
# tree for identical strings results in a match node. However,
|
||||||
|
# since two empty strings have a zero-length LCS, a substitution
|
||||||
|
# node would be created. Since we do not want to clutter the
|
||||||
|
# recursive tree construction with logic for this case, handle
|
||||||
|
# it in this wrapper method.
|
||||||
|
if len(form) == 0 and len(lemma) == 0:
|
||||||
|
tree = edittree_new_match(0, 0, NULL_TREE_ID, NULL_TREE_ID)
|
||||||
|
return self._tree_id(tree)
|
||||||
|
|
||||||
|
return self._add(form, lemma)
|
||||||
|
|
||||||
|
cdef uint32_t _add(self, str form, str lemma):
|
||||||
|
cdef LCS lcs = find_lcs(form, lemma)
|
||||||
|
|
||||||
|
cdef EditTreeC tree
|
||||||
|
cdef uint32_t tree_id, prefix_tree, suffix_tree
|
||||||
|
if lcs_is_empty(lcs):
|
||||||
|
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
||||||
|
else:
|
||||||
|
# If we have a non-empty LCS, such as "gooi" in "ge[gooi]d" and "[gooi]en",
|
||||||
|
# create edit trees for the prefix pair ("ge"/"") and the suffix pair ("d"/"en").
|
||||||
|
prefix_tree = NULL_TREE_ID
|
||||||
|
if lcs.source_begin != 0 or lcs.target_begin != 0:
|
||||||
|
prefix_tree = self.add(form[:lcs.source_begin], lemma[:lcs.target_begin])
|
||||||
|
|
||||||
|
suffix_tree = NULL_TREE_ID
|
||||||
|
if lcs.source_end != len(form) or lcs.target_end != len(lemma):
|
||||||
|
suffix_tree = self.add(form[lcs.source_end:], lemma[lcs.target_end:])
|
||||||
|
|
||||||
|
tree = edittree_new_match(lcs.source_begin, len(form) - lcs.source_end, prefix_tree, suffix_tree)
|
||||||
|
|
||||||
|
return self._tree_id(tree)
|
||||||
|
|
||||||
|
cdef uint32_t _tree_id(self, EditTreeC tree):
|
||||||
|
# If this tree has been constructed before, return its identifier.
|
||||||
|
cdef hash_t hash = edittree_hash(tree)
|
||||||
|
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
|
||||||
|
if iter != self.map.end():
|
||||||
|
return deref(iter).second
|
||||||
|
|
||||||
|
# The tree hasn't been seen before, store it.
|
||||||
|
cdef uint32_t tree_id = self.trees.size()
|
||||||
|
self.trees.push_back(tree)
|
||||||
|
self.map.insert(pair[hash_t, uint32_t](hash, tree_id))
|
||||||
|
|
||||||
|
return tree_id
|
||||||
|
|
||||||
|
cpdef str apply(self, uint32_t tree_id, str form):
|
||||||
|
"""Apply an edit tree to a form.
|
||||||
|
|
||||||
|
tree_id (uint32_t): the identifier of the edit tree to apply.
|
||||||
|
form (str): the form to apply the edit tree to.
|
||||||
|
RETURNS (str): the transformer form or None if the edit tree
|
||||||
|
could not be applied to the form.
|
||||||
|
"""
|
||||||
|
if tree_id >= self.trees.size():
|
||||||
|
raise IndexError("Edit tree identifier out of range")
|
||||||
|
|
||||||
|
lemma_pieces = []
|
||||||
|
try:
|
||||||
|
self._apply(tree_id, form, lemma_pieces)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return "".join(lemma_pieces)
|
||||||
|
|
||||||
|
cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces):
|
||||||
|
"""Recursively apply an edit tree to a form, adding pieces to
|
||||||
|
the lemma_pieces list."""
|
||||||
|
assert tree_id <= self.trees.size()
|
||||||
|
|
||||||
|
cdef EditTreeC tree = self.trees[tree_id]
|
||||||
|
cdef MatchNodeC match_node
|
||||||
|
cdef int suffix_start
|
||||||
|
|
||||||
|
if tree.is_match_node:
|
||||||
|
match_node = tree.inner.match_node
|
||||||
|
|
||||||
|
if match_node.prefix_len + match_node.suffix_len > len(form_part):
|
||||||
|
raise ValueError("Edit tree cannot be applied to form")
|
||||||
|
|
||||||
|
suffix_start = len(form_part) - match_node.suffix_len
|
||||||
|
|
||||||
|
if match_node.prefix_tree != NULL_TREE_ID:
|
||||||
|
self._apply(match_node.prefix_tree, form_part[:match_node.prefix_len], lemma_pieces)
|
||||||
|
|
||||||
|
lemma_pieces.append(form_part[match_node.prefix_len:suffix_start])
|
||||||
|
|
||||||
|
if match_node.suffix_tree != NULL_TREE_ID:
|
||||||
|
self._apply(match_node.suffix_tree, form_part[suffix_start:], lemma_pieces)
|
||||||
|
else:
|
||||||
|
if form_part == self.strings[tree.inner.subst_node.orig]:
|
||||||
|
lemma_pieces.append(self.strings[tree.inner.subst_node.subst])
|
||||||
|
else:
|
||||||
|
raise ValueError("Edit tree cannot be applied to form")
|
||||||
|
|
||||||
|
cpdef unicode tree_to_str(self, uint32_t tree_id):
|
||||||
|
"""Return the tree as a string. The tree tree string is formatted
|
||||||
|
like an S-expression. This is primarily useful for debugging. Match
|
||||||
|
nodes have the following format:
|
||||||
|
|
||||||
|
(m prefix_len suffix_len prefix_tree suffix_tree)
|
||||||
|
|
||||||
|
Substitution nodes have the following format:
|
||||||
|
|
||||||
|
(s original substitute)
|
||||||
|
|
||||||
|
tree_id (uint32_t): the identifier of the edit tree.
|
||||||
|
RETURNS (str): the tree as an S-expression.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if tree_id >= self.trees.size():
|
||||||
|
raise IndexError("Edit tree identifier out of range")
|
||||||
|
|
||||||
|
cdef EditTreeC tree = self.trees[tree_id]
|
||||||
|
cdef SubstNodeC subst_node
|
||||||
|
|
||||||
|
if not tree.is_match_node:
|
||||||
|
subst_node = tree.inner.subst_node
|
||||||
|
return f"(s '{self.strings[subst_node.orig]}' '{self.strings[subst_node.subst]}')"
|
||||||
|
|
||||||
|
cdef MatchNodeC match_node = tree.inner.match_node
|
||||||
|
|
||||||
|
prefix_tree = "()"
|
||||||
|
if match_node.prefix_tree != NULL_TREE_ID:
|
||||||
|
prefix_tree = self.tree_to_str(match_node.prefix_tree)
|
||||||
|
|
||||||
|
suffix_tree = "()"
|
||||||
|
if match_node.suffix_tree != NULL_TREE_ID:
|
||||||
|
suffix_tree = self.tree_to_str(match_node.suffix_tree)
|
||||||
|
|
||||||
|
return f"(m {match_node.prefix_len} {match_node.suffix_len} {prefix_tree} {suffix_tree})"
|
||||||
|
|
||||||
|
def from_json(self, trees: list) -> "EditTrees":
|
||||||
|
self.trees.clear()
|
||||||
|
|
||||||
|
for tree in trees:
|
||||||
|
tree = _dict2tree(tree)
|
||||||
|
self.trees.push_back(tree)
|
||||||
|
|
||||||
|
self._rebuild_tree_map()
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data: bytes, *) -> "EditTrees":
|
||||||
|
def deserialize_trees(tree_dicts):
|
||||||
|
cdef EditTreeC c_tree
|
||||||
|
for tree_dict in tree_dicts:
|
||||||
|
c_tree = _dict2tree(tree_dict)
|
||||||
|
self.trees.push_back(c_tree)
|
||||||
|
|
||||||
|
deserializers = {}
|
||||||
|
deserializers["trees"] = lambda n: deserialize_trees(n)
|
||||||
|
util.from_bytes(bytes_data, deserializers, [])
|
||||||
|
|
||||||
|
self._rebuild_tree_map()
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_bytes(self, **kwargs) -> bytes:
|
||||||
|
tree_dicts = []
|
||||||
|
for tree in self.trees:
|
||||||
|
tree = _tree2dict(tree)
|
||||||
|
tree_dicts.append(tree)
|
||||||
|
|
||||||
|
serializers = {}
|
||||||
|
serializers["trees"] = lambda: tree_dicts
|
||||||
|
|
||||||
|
return util.to_bytes(serializers, [])
|
||||||
|
|
||||||
|
def to_disk(self, path, **kwargs) -> "EditTrees":
|
||||||
|
path = util.ensure_path(path)
|
||||||
|
with path.open("wb") as file_:
|
||||||
|
file_.write(self.to_bytes())
|
||||||
|
|
||||||
|
def from_disk(self, path, **kwargs) -> "EditTrees":
|
||||||
|
path = util.ensure_path(path)
|
||||||
|
if path.exists():
|
||||||
|
with path.open("rb") as file_:
|
||||||
|
data = file_.read()
|
||||||
|
return self.from_bytes(data)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __getitem__(self, idx):
|
||||||
|
return _tree2dict(self.trees[idx])
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return self.trees.size()
|
||||||
|
|
||||||
|
def _rebuild_tree_map(self):
|
||||||
|
"""Rebuild the tree hash -> tree id mapping"""
|
||||||
|
cdef EditTreeC c_tree
|
||||||
|
cdef uint32_t tree_id
|
||||||
|
cdef hash_t tree_hash
|
||||||
|
|
||||||
|
self.map.clear()
|
||||||
|
|
||||||
|
for tree_id in range(self.trees.size()):
|
||||||
|
c_tree = self.trees[tree_id]
|
||||||
|
tree_hash = edittree_hash(c_tree)
|
||||||
|
self.map.insert(pair[hash_t, uint32_t](tree_hash, tree_id))
|
||||||
|
|
||||||
|
def __reduce__(self):
|
||||||
|
return (unpickle_edittrees, (self.strings, self.to_bytes()))
|
||||||
|
|
||||||
|
|
||||||
|
def unpickle_edittrees(strings, trees_data):
|
||||||
|
return EditTrees(strings).from_bytes(trees_data)
|
||||||
|
|
||||||
|
|
||||||
|
def _tree2dict(tree):
|
||||||
|
if tree["is_match_node"]:
|
||||||
|
tree = tree["inner"]["match_node"]
|
||||||
|
else:
|
||||||
|
tree = tree["inner"]["subst_node"]
|
||||||
|
return(dict(tree))
|
||||||
|
|
||||||
|
def _dict2tree(tree):
|
||||||
|
errors = validate_edit_tree(tree)
|
||||||
|
if errors:
|
||||||
|
raise ValueError(Errors.E1026.format(errors="\n".join(errors)))
|
||||||
|
|
||||||
|
tree = dict(tree)
|
||||||
|
if "prefix_len" in tree:
|
||||||
|
tree = {"is_match_node": True, "inner": {"match_node": tree}}
|
||||||
|
else:
|
||||||
|
tree = {"is_match_node": False, "inner": {"subst_node": tree}}
|
||||||
|
|
||||||
|
return tree
|
44
spacy/pipeline/_edit_tree_internals/schemas.py
Normal file
44
spacy/pipeline/_edit_tree_internals/schemas.py
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
from typing import Any, Dict, List, Union
|
||||||
|
from collections import defaultdict
|
||||||
|
from pydantic import BaseModel, Field, ValidationError
|
||||||
|
from pydantic.types import StrictBool, StrictInt, StrictStr
|
||||||
|
|
||||||
|
|
||||||
|
class MatchNodeSchema(BaseModel):
|
||||||
|
prefix_len: StrictInt = Field(..., title="Prefix length")
|
||||||
|
suffix_len: StrictInt = Field(..., title="Suffix length")
|
||||||
|
prefix_tree: StrictInt = Field(..., title="Prefix tree")
|
||||||
|
suffix_tree: StrictInt = Field(..., title="Suffix tree")
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
extra = "forbid"
|
||||||
|
|
||||||
|
|
||||||
|
class SubstNodeSchema(BaseModel):
|
||||||
|
orig: Union[int, StrictStr] = Field(..., title="Original substring")
|
||||||
|
subst: Union[int, StrictStr] = Field(..., title="Replacement substring")
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
extra = "forbid"
|
||||||
|
|
||||||
|
|
||||||
|
class EditTreeSchema(BaseModel):
|
||||||
|
__root__: Union[MatchNodeSchema, SubstNodeSchema]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_edit_tree(obj: Dict[str, Any]) -> List[str]:
|
||||||
|
"""Validate edit tree.
|
||||||
|
|
||||||
|
obj (Dict[str, Any]): JSON-serializable data to validate.
|
||||||
|
RETURNS (List[str]): A list of error messages, if available.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
EditTreeSchema.parse_obj(obj)
|
||||||
|
return []
|
||||||
|
except ValidationError as e:
|
||||||
|
errors = e.errors()
|
||||||
|
data = defaultdict(list)
|
||||||
|
for error in errors:
|
||||||
|
err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
|
||||||
|
data[err_loc].append(error.get("msg"))
|
||||||
|
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type]
|
|
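A short usage sketch for the validator defined above (the module path is the one added in this commit); valid trees yield an empty error list, anything else yields messages:

# Sketch: validating serialized edit-tree dicts with validate_edit_tree.
from spacy.pipeline._edit_tree_internals.schemas import validate_edit_tree

print(validate_edit_tree({"orig": "d", "subst": "en"}))        # [] -> valid substitution node
print(validate_edit_tree({"orig": "d", "unexpected_key": 1}))  # list of error messages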
@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
cimport libcpp
|
cimport libcpp
|
||||||
|
from libcpp.unordered_map cimport unordered_map
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libcpp.set cimport set
|
from libcpp.set cimport set
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||||
|
@ -30,8 +31,8 @@ cdef cppclass StateC:
|
||||||
vector[int] _stack
|
vector[int] _stack
|
||||||
vector[int] _rebuffer
|
vector[int] _rebuffer
|
||||||
vector[SpanC] _ents
|
vector[SpanC] _ents
|
||||||
vector[ArcC] _left_arcs
|
unordered_map[int, vector[ArcC]] _left_arcs
|
||||||
vector[ArcC] _right_arcs
|
unordered_map[int, vector[ArcC]] _right_arcs
|
||||||
vector[libcpp.bool] _unshiftable
|
vector[libcpp.bool] _unshiftable
|
||||||
set[int] _sent_starts
|
set[int] _sent_starts
|
||||||
TokenC _empty_token
|
TokenC _empty_token
|
||||||
|
@ -160,15 +161,22 @@ cdef cppclass StateC:
|
||||||
else:
|
else:
|
||||||
return &this._sent[i]
|
return &this._sent[i]
|
||||||
|
|
||||||
void get_arcs(vector[ArcC]* arcs) nogil const:
|
void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const:
|
||||||
for i in range(this._left_arcs.size()):
|
cdef const vector[ArcC]* arcs
|
||||||
arc = this._left_arcs.at(i)
|
head_arcs_it = heads_arcs.const_begin()
|
||||||
if arc.head != -1 and arc.child != -1:
|
while head_arcs_it != heads_arcs.const_end():
|
||||||
arcs.push_back(arc)
|
arcs = &deref(head_arcs_it).second
|
||||||
for i in range(this._right_arcs.size()):
|
arcs_it = arcs.const_begin()
|
||||||
arc = this._right_arcs.at(i)
|
while arcs_it != arcs.const_end():
|
||||||
if arc.head != -1 and arc.child != -1:
|
arc = deref(arcs_it)
|
||||||
arcs.push_back(arc)
|
if arc.head != -1 and arc.child != -1:
|
||||||
|
out.push_back(arc)
|
||||||
|
incr(arcs_it)
|
||||||
|
incr(head_arcs_it)
|
||||||
|
|
||||||
|
void get_arcs(vector[ArcC]* out) nogil const:
|
||||||
|
this.map_get_arcs(this._left_arcs, out)
|
||||||
|
this.map_get_arcs(this._right_arcs, out)
|
||||||
|
|
||||||
int H(int child) nogil const:
|
int H(int child) nogil const:
|
||||||
if child >= this.length or child < 0:
|
if child >= this.length or child < 0:
|
||||||
|
@ -182,37 +190,35 @@ cdef cppclass StateC:
|
||||||
else:
|
else:
|
||||||
return this._ents.back().start
|
return this._ents.back().start
|
||||||
|
|
||||||
int L(int head, int idx) nogil const:
|
int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const:
|
||||||
if idx < 1 or this._left_arcs.size() == 0:
|
if idx < 1:
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
# Work backwards through left-arcs to find the arc at the
|
head_arcs_it = heads_arcs.const_find(head)
|
||||||
|
if head_arcs_it == heads_arcs.const_end():
|
||||||
|
return -1
|
||||||
|
|
||||||
|
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
|
||||||
|
|
||||||
|
# Work backwards through arcs to find the arc at the
|
||||||
# requested index more quickly.
|
# requested index more quickly.
|
||||||
cdef size_t child_index = 0
|
cdef size_t child_index = 0
|
||||||
it = this._left_arcs.const_rbegin()
|
arcs_it = arcs.const_rbegin()
|
||||||
while it != this._left_arcs.rend():
|
while arcs_it != arcs.const_rend() and child_index != idx:
|
||||||
arc = deref(it)
|
arc = deref(arcs_it)
|
||||||
if arc.head == head and arc.child != -1 and arc.child < head:
|
if arc.child != -1:
|
||||||
child_index += 1
|
child_index += 1
|
||||||
if child_index == idx:
|
if child_index == idx:
|
||||||
return arc.child
|
return arc.child
|
||||||
incr(it)
|
incr(arcs_it)
|
||||||
|
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
int L(int head, int idx) nogil const:
|
||||||
|
return this.nth_child(this._left_arcs, head, idx)
|
||||||
|
|
||||||
int R(int head, int idx) nogil const:
|
int R(int head, int idx) nogil const:
|
||||||
if idx < 1 or this._right_arcs.size() == 0:
|
return this.nth_child(this._right_arcs, head, idx)
|
||||||
return -1
|
|
||||||
cdef vector[int] rights
|
|
||||||
for i in range(this._right_arcs.size()):
|
|
||||||
arc = this._right_arcs.at(i)
|
|
||||||
if arc.head == head and arc.child != -1 and arc.child > head:
|
|
||||||
rights.push_back(arc.child)
|
|
||||||
idx = (<int>rights.size()) - idx
|
|
||||||
if idx < 0:
|
|
||||||
return -1
|
|
||||||
else:
|
|
||||||
return rights.at(idx)
|
|
||||||
|
|
||||||
bint empty() nogil const:
|
bint empty() nogil const:
|
||||||
return this._stack.size() == 0
|
return this._stack.size() == 0
|
||||||
|
@ -253,22 +259,29 @@ cdef cppclass StateC:
|
||||||
|
|
||||||
int r_edge(int word) nogil const:
|
int r_edge(int word) nogil const:
|
||||||
return word
|
return word
|
||||||
|
|
||||||
int n_L(int head) nogil const:
|
int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const:
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
for i in range(this._left_arcs.size()):
|
head_arcs_it = heads_arcs.const_find(head)
|
||||||
arc = this._left_arcs.at(i)
|
if head_arcs_it == heads_arcs.const_end():
|
||||||
if arc.head == head and arc.child != -1 and arc.child < arc.head:
|
return n
|
||||||
|
|
||||||
|
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
|
||||||
|
arcs_it = arcs.const_begin()
|
||||||
|
while arcs_it != arcs.end():
|
||||||
|
arc = deref(arcs_it)
|
||||||
|
if arc.child != -1:
|
||||||
n += 1
|
n += 1
|
||||||
|
incr(arcs_it)
|
||||||
|
|
||||||
return n
|
return n
|
||||||
|
|
||||||
|
|
||||||
|
int n_L(int head) nogil const:
|
||||||
|
return n_arcs(this._left_arcs, head)
|
||||||
|
|
||||||
int n_R(int head) nogil const:
|
int n_R(int head) nogil const:
|
||||||
cdef int n = 0
|
return n_arcs(this._right_arcs, head)
|
||||||
for i in range(this._right_arcs.size()):
|
|
||||||
arc = this._right_arcs.at(i)
|
|
||||||
if arc.head == head and arc.child != -1 and arc.child > arc.head:
|
|
||||||
n += 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
bint stack_is_connected() nogil const:
|
bint stack_is_connected() nogil const:
|
||||||
return False
|
return False
|
||||||
|
@ -328,19 +341,20 @@ cdef cppclass StateC:
|
||||||
arc.child = child
|
arc.child = child
|
||||||
arc.label = label
|
arc.label = label
|
||||||
if head > child:
|
if head > child:
|
||||||
this._left_arcs.push_back(arc)
|
this._left_arcs[arc.head].push_back(arc)
|
||||||
else:
|
else:
|
||||||
this._right_arcs.push_back(arc)
|
this._right_arcs[arc.head].push_back(arc)
|
||||||
this._heads[child] = head
|
this._heads[child] = head
|
||||||
|
|
||||||
void del_arc(int h_i, int c_i) nogil:
|
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
|
||||||
cdef vector[ArcC]* arcs
|
arcs_it = heads_arcs.find(h_i)
|
||||||
if h_i > c_i:
|
if arcs_it == heads_arcs.end():
|
||||||
arcs = &this._left_arcs
|
return
|
||||||
else:
|
|
||||||
arcs = &this._right_arcs
|
arcs = &deref(arcs_it).second
|
||||||
if arcs.size() == 0:
|
if arcs.size() == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
arc = arcs.back()
|
arc = arcs.back()
|
||||||
if arc.head == h_i and arc.child == c_i:
|
if arc.head == h_i and arc.child == c_i:
|
||||||
arcs.pop_back()
|
arcs.pop_back()
|
||||||
|
@ -353,6 +367,12 @@ cdef cppclass StateC:
|
||||||
arc.label = 0
|
arc.label = 0
|
||||||
break
|
break
|
||||||
|
|
||||||
|
void del_arc(int h_i, int c_i) nogil:
|
||||||
|
if h_i > c_i:
|
||||||
|
this.map_del_arc(&this._left_arcs, h_i, c_i)
|
||||||
|
else:
|
||||||
|
this.map_del_arc(&this._right_arcs, h_i, c_i)
|
||||||
|
|
||||||
SpanC get_ent() nogil const:
|
SpanC get_ent() nogil const:
|
||||||
cdef SpanC ent
|
cdef SpanC ent
|
||||||
if this._ents.size() == 0:
|
if this._ents.size() == 0:
|
||||||
|
|
|
@@ -218,7 +218,7 @@ def _get_aligned_sent_starts(example):
         sent_starts = [False] * len(example.x)
         seen_words = set()
         for y_sent in example.y.sents:
-            x_indices = list(align[y_sent.start : y_sent.end].dataXd)
+            x_indices = list(align[y_sent.start : y_sent.end])
             if any(x_idx in seen_words for x_idx in x_indices):
                 # If there are any tokens in X that align across two sentences,
                 # regard the sentence annotations as missing, as we can't
|
@@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.
 """
 from copy import copy
+from libc.limits cimport INT_MAX
+from libc.stdlib cimport abs
+from libcpp cimport bool
+from libcpp.vector cimport vector

 from ...tokens.doc cimport Doc, set_children_from_heads
||||||
|
|
||||||
|
@ -41,13 +45,18 @@ def contains_cycle(heads):
|
||||||
|
|
||||||
|
|
||||||
def is_nonproj_arc(tokenid, heads):
|
def is_nonproj_arc(tokenid, heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
|
return _is_nonproj_arc(tokenid, c_heads)
|
||||||
|
|
||||||
|
|
||||||
|
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
|
||||||
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
||||||
# if there is a token k, h < k < d such that h is not
|
# if there is a token k, h < k < d such that h is not
|
||||||
# an ancestor of k. Same for h -> d, h > d
|
# an ancestor of k. Same for h -> d, h > d
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
if head == tokenid: # root arcs cannot be non-projective
|
if head == tokenid: # root arcs cannot be non-projective
|
||||||
return False
|
return False
|
||||||
elif head is None: # unattached tokens cannot be non-projective
|
elif head < 0: # unattached tokens cannot be non-projective
|
||||||
return False
|
return False
|
||||||
|
|
||||||
cdef int start, end
|
cdef int start, end
|
||||||
|
@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads):
|
||||||
else:
|
else:
|
||||||
start, end = (tokenid+1, head)
|
start, end = (tokenid+1, head)
|
||||||
for k in range(start, end):
|
for k in range(start, end):
|
||||||
for ancestor in ancestors(k, heads):
|
if _has_head_as_ancestor(k, head, heads):
|
||||||
if ancestor is None: # for unattached tokens/subtrees
|
continue
|
||||||
break
|
|
||||||
elif ancestor == head: # normal case: k dominated by h
|
|
||||||
break
|
|
||||||
else: # head not in ancestors: d -> h is non-projective
|
else: # head not in ancestors: d -> h is non-projective
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:
|
||||||
|
ancestor = tokenid
|
||||||
|
cnt = 0
|
||||||
|
while cnt < heads.size():
|
||||||
|
if heads[ancestor] == head or heads[ancestor] < 0:
|
||||||
|
return True
|
||||||
|
ancestor = heads[ancestor]
|
||||||
|
cnt += 1
|
||||||
|
|
||||||
|
return False
|
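The definition in the comment above (Havelka 2007) translates directly into a small pure-Python check; this is a sketch, not the Cython code, with heads[i] giving the head index of token i and heads[i] == i marking the root:

# Sketch of the non-projectivity test: the arc head -> tokenid is
# non-projective if some token strictly between them is not dominated by head.
def is_nonproj_arc(tokenid: int, heads: list) -> bool:
    head = heads[tokenid]
    if head == tokenid or head is None:
        return False
    start, end = (head + 1, tokenid) if head < tokenid else (tokenid + 1, head)
    for k in range(start, end):
        if not has_head_as_ancestor(k, head, heads):
            return True
    return False


def has_head_as_ancestor(tokenid: int, head: int, heads: list) -> bool:
    ancestor = tokenid
    for _ in range(len(heads)):  # bounded walk up the tree, as in the Cython version
        if heads[ancestor] == head or heads[ancestor] is None:
            return True
        ancestor = heads[ancestor]
    return False


# Token 2 attaches to 0 while token 1 attaches to 3, so the arc 3 -> 1 crosses it.
heads = [0, 3, 0, 0]
print([is_nonproj_arc(i, heads) for i in range(len(heads))])  # [False, True, False, False]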
||||||
|
|
||||||
|
|
||||||
def is_nonproj_tree(heads):
|
def is_nonproj_tree(heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
# a tree is non-projective if at least one arc is non-projective
|
# a tree is non-projective if at least one arc is non-projective
|
||||||
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
|
return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
|
||||||
|
|
||||||
|
|
||||||
def decompose(label):
|
def decompose(label):
|
||||||
|
@ -98,16 +117,31 @@ def projectivize(heads, labels):
|
||||||
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
||||||
# which encode a projective and decorated tree.
|
# which encode a projective and decorated tree.
|
||||||
proj_heads = copy(heads)
|
proj_heads = copy(heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
|
||||||
if smallest_np_arc is None: # this sentence is already projective
|
cdef int new_head
|
||||||
|
cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
|
||||||
|
cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
||||||
|
if smallest_np_arc == -1: # this sentence is already projective
|
||||||
return proj_heads, copy(labels)
|
return proj_heads, copy(labels)
|
||||||
while smallest_np_arc is not None:
|
while smallest_np_arc != -1:
|
||||||
_lift(smallest_np_arc, proj_heads)
|
new_head = _lift(smallest_np_arc, proj_heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
c_proj_heads[smallest_np_arc] = new_head
|
||||||
|
smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
||||||
deco_labels = _decorate(heads, proj_heads, labels)
|
deco_labels = _decorate(heads, proj_heads, labels)
|
||||||
return proj_heads, deco_labels
|
return proj_heads, deco_labels
|
||||||
|
|
||||||
|
|
||||||
|
cdef vector[int] _heads_to_c(heads):
|
||||||
|
cdef vector[int] c_heads;
|
||||||
|
for head in heads:
|
||||||
|
if head == None:
|
||||||
|
c_heads.push_back(-1)
|
||||||
|
else:
|
||||||
|
assert head < len(heads)
|
||||||
|
c_heads.push_back(head)
|
||||||
|
return c_heads
|
||||||
|
|
||||||
|
|
||||||
cpdef deprojectivize(Doc doc):
|
cpdef deprojectivize(Doc doc):
|
||||||
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
||||||
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
||||||
|
@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels):
|
||||||
deco_labels.append(labels[tokenid])
|
deco_labels.append(labels[tokenid])
|
||||||
return deco_labels
|
return deco_labels
|
||||||
|
|
||||||
|
def get_smallest_nonproj_arc_slow(heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
|
return _get_smallest_nonproj_arc(c_heads)
|
||||||
|
|
||||||
def _get_smallest_nonproj_arc(heads):
|
|
||||||
|
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:
|
||||||
# return the smallest non-proj arc or None
|
# return the smallest non-proj arc or None
|
||||||
# where size is defined as the distance between dep and head
|
# where size is defined as the distance between dep and head
|
||||||
# and ties are broken left to right
|
# and ties are broken left to right
|
||||||
smallest_size = float('inf')
|
cdef int smallest_size = INT_MAX
|
||||||
smallest_np_arc = None
|
cdef int smallest_np_arc = -1
|
||||||
for tokenid, head in enumerate(heads):
|
cdef int size
|
||||||
|
cdef int tokenid
|
||||||
|
cdef int head
|
||||||
|
|
||||||
|
for tokenid in range(heads.size()):
|
||||||
|
head = heads[tokenid]
|
||||||
size = abs(tokenid-head)
|
size = abs(tokenid-head)
|
||||||
if size < smallest_size and is_nonproj_arc(tokenid, heads):
|
if size < smallest_size and _is_nonproj_arc(tokenid, heads):
|
||||||
smallest_size = size
|
smallest_size = size
|
||||||
smallest_np_arc = tokenid
|
smallest_np_arc = tokenid
|
||||||
return smallest_np_arc
|
return smallest_np_arc
|
||||||
|
|
||||||
|
|
||||||
def _lift(tokenid, heads):
|
cpdef int _lift(tokenid, heads):
|
||||||
# reattaches a word to it's grandfather
|
# reattaches a word to it's grandfather
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
ghead = heads[head]
|
ghead = heads[head]
|
||||||
|
cdef int new_head = ghead if head != ghead else tokenid
|
||||||
# attach to ghead if head isn't attached to root else attach to root
|
# attach to ghead if head isn't attached to root else attach to root
|
||||||
heads[tokenid] = ghead if head != ghead else tokenid
|
heads[tokenid] = new_head
|
||||||
|
return new_head
|
||||||
|
|
||||||
|
|
||||||
def _find_new_head(token, headlabel):
|
def _find_new_head(token, headlabel):
|
||||||
|
|
379
spacy/pipeline/edit_tree_lemmatizer.py
Normal file
379
spacy/pipeline/edit_tree_lemmatizer.py
Normal file
|
@ -0,0 +1,379 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
from typing import Sequence, Tuple, Union
from collections import Counter
from copy import deepcopy
from itertools import islice
import numpy as np

import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d, Ints2d

from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree
from .lemmatizer import lemmatizer_score
from .trainable_pipe import TrainablePipe
from ..errors import Errors
from ..language import Language
from ..tokens import Doc
from ..training import Example, validate_examples, validate_get_examples
from ..vocab import Vocab
from .. import util


default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "trainable_lemmatizer",
    assigns=["token.lemma"],
    requires=[],
    default_config={
        "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
        "backoff": "orth",
        "min_tree_freq": 3,
        "overwrite": False,
        "top_k": 1,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_edit_tree_lemmatizer(
    nlp: Language,
    name: str,
    model: Model,
    backoff: Optional[str],
    min_tree_freq: int,
    overwrite: bool,
    top_k: int,
    scorer: Optional[Callable],
):
    """Construct an EditTreeLemmatizer component."""
    return EditTreeLemmatizer(
        nlp.vocab,
        model,
        name,
        backoff=backoff,
        min_tree_freq=min_tree_freq,
        overwrite=overwrite,
        top_k=top_k,
        scorer=scorer,
    )


class EditTreeLemmatizer(TrainablePipe):
    """
    Lemmatizer that lemmatizes each word using a predicted edit tree.
    """

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "trainable_lemmatizer",
        *,
        backoff: Optional[str] = "orth",
        min_tree_freq: int = 3,
        overwrite: bool = False,
        top_k: int = 1,
        scorer: Optional[Callable] = lemmatizer_score,
    ):
        """
        Construct an edit tree lemmatizer.

        backoff (Optional[str]): backoff to use when the predicted edit trees
            are not applicable. Must be an attribute of Token or None (leave the
            lemma unset).
        min_tree_freq (int): prune trees that are applied less than this
            frequency in the training data.
        overwrite (bool): overwrite existing lemma annotations.
        top_k (int): try to apply at most the k most probable edit trees.
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self.backoff = backoff
        self.min_tree_freq = min_tree_freq
        self.overwrite = overwrite
        self.top_k = top_k

        self.trees = EditTrees(self.vocab.strings)
        self.tree2label: Dict[int, int] = {}

        self.cfg: Dict[str, Any] = {"labels": []}
        self.scorer = scorer

    def get_loss(
        self, examples: Iterable[Example], scores: List[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        validate_examples(examples, "EditTreeLemmatizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)

        truths = []
        for eg in examples:
            eg_truths = []
            for (predicted, gold_lemma) in zip(
                eg.predicted, eg.get_aligned("LEMMA", as_string=True)
            ):
                if gold_lemma is None:
                    label = -1
                else:
                    tree_id = self.trees.add(predicted.text, gold_lemma)
                    label = self.tree2label.get(tree_id, 0)
                eg_truths.append(label)

            truths.append(eg_truths)

        d_scores, loss = loss_func(scores, truths)  # type: ignore
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))

        return float(loss), d_scores

    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
        n_docs = len(list(docs))
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.cfg["labels"])
            guesses: List[Ints2d] = [
                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
            ]
            assert len(guesses) == n_docs
            return guesses
        scores = self.model.predict(docs)
        assert len(scores) == n_docs
        guesses = self._scores2guesses(docs, scores)
        assert len(guesses) == n_docs
        return guesses

    def _scores2guesses(self, docs, scores):
        guesses = []
        for doc, doc_scores in zip(docs, scores):
            if self.top_k == 1:
                doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
            else:
                doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]

            if not isinstance(doc_guesses, np.ndarray):
                doc_guesses = doc_guesses.get()

            doc_compat_guesses = []
            for token, candidates in zip(doc, doc_guesses):
                tree_id = -1
                for candidate in candidates:
                    candidate_tree_id = self.cfg["labels"][candidate]

                    if self.trees.apply(candidate_tree_id, token.text) is not None:
                        tree_id = candidate_tree_id
                        break
                doc_compat_guesses.append(tree_id)

            guesses.append(np.array(doc_compat_guesses))

        return guesses

    def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
        for i, doc in enumerate(docs):
            doc_tree_ids = batch_tree_ids[i]
            if hasattr(doc_tree_ids, "get"):
                doc_tree_ids = doc_tree_ids.get()
            for j, tree_id in enumerate(doc_tree_ids):
                if self.overwrite or doc[j].lemma == 0:
                    # If no applicable tree could be found during prediction,
                    # the special identifier -1 is used. Otherwise the tree
                    # is guaranteed to be applicable.
                    if tree_id == -1:
                        if self.backoff is not None:
                            doc[j].lemma = getattr(doc[j], self.backoff)
                    else:
                        lemma = self.trees.apply(tree_id, doc[j].text)
                        doc[j].lemma_ = lemma

    @property
    def labels(self) -> Tuple[int, ...]:
        """Returns the labels currently added to the component."""
        return tuple(self.cfg["labels"])

    @property
    def hide_labels(self) -> bool:
        return True

    @property
    def label_data(self) -> Dict:
        trees = []
        for tree_id in range(len(self.trees)):
            tree = self.trees[tree_id]
            if "orig" in tree:
                tree["orig"] = self.vocab.strings[tree["orig"]]
            if "subst" in tree:
                tree["subst"] = self.vocab.strings[tree["subst"]]
            trees.append(tree)
        return dict(trees=trees, labels=tuple(self.cfg["labels"]))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[Dict] = None,
    ):
        validate_get_examples(get_examples, "EditTreeLemmatizer.initialize")

        if labels is None:
            self._labels_from_data(get_examples)
        else:
            self._add_labels(labels)

        # Sample for the model.
        doc_sample = []
        label_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_labels: List[List[float]] = []
            for token in example.reference:
                if token.lemma == 0:
                    gold_label = None
                else:
                    gold_label = self._pair2label(token.text, token.lemma_)

                gold_labels.append(
                    [
                        1.0 if label == gold_label else 0.0
                        for label in self.cfg["labels"]
                    ]
                )

            gold_labels = cast(Floats2d, gold_labels)
            label_sample.append(self.model.ops.asarray(gold_labels, dtype="float32"))

        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)

        self.model.initialize(X=doc_sample, Y=label_sample)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        deserializers = {
            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
            "model": lambda b: self.model.from_bytes(b),
            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
            "trees": lambda b: self.trees.from_bytes(b),
        }

        util.from_bytes(bytes_data, deserializers, exclude)

        return self

    def to_bytes(self, *, exclude=tuple()):
        serializers = {
            "cfg": lambda: srsly.json_dumps(self.cfg),
            "model": lambda: self.model.to_bytes(),
            "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
            "trees": lambda: self.trees.to_bytes(),
        }

        return util.to_bytes(serializers, exclude)

    def to_disk(self, path, exclude=tuple()):
        path = util.ensure_path(path)
        serializers = {
            "cfg": lambda p: srsly.write_json(p, self.cfg),
            "model": lambda p: self.model.to_disk(p),
            "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
            "trees": lambda p: self.trees.to_disk(p),
        }
        util.to_disk(path, serializers, exclude)

    def from_disk(self, path, exclude=tuple()):
        def load_model(p):
            try:
                with open(p, "rb") as mfile:
                    self.model.from_bytes(mfile.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserializers = {
            "cfg": lambda p: self.cfg.update(srsly.read_json(p)),
            "model": load_model,
            "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
            "trees": lambda p: self.trees.from_disk(p),
        }

        util.from_disk(path, deserializers, exclude)
        return self

    def _add_labels(self, labels: Dict):
        if "labels" not in labels:
            raise ValueError(Errors.E857.format(name="labels"))
        if "trees" not in labels:
            raise ValueError(Errors.E857.format(name="trees"))

        self.cfg["labels"] = list(labels["labels"])
        trees = []
        for tree in labels["trees"]:
            errors = validate_edit_tree(tree)
            if errors:
                raise ValueError(Errors.E1026.format(errors="\n".join(errors)))

            tree = dict(tree)
            if "orig" in tree:
                tree["orig"] = self.vocab.strings[tree["orig"]]
if "orig" in tree:
|
||||||
|
tree["subst"] = self.vocab.strings[tree["subst"]]
|
||||||
|
|
||||||
|
trees.append(tree)
|
||||||
|
|
||||||
|
self.trees.from_json(trees)
|
||||||
|
|
||||||
|
for label, tree in enumerate(self.labels):
|
||||||
|
self.tree2label[tree] = label
|
||||||
|
|
||||||
|
def _labels_from_data(self, get_examples: Callable[[], Iterable[Example]]):
|
||||||
|
# Count corpus tree frequencies in ad-hoc storage to avoid cluttering
|
||||||
|
# the final pipe/string store.
|
||||||
|
vocab = Vocab()
|
||||||
|
trees = EditTrees(vocab.strings)
|
||||||
|
tree_freqs: Counter = Counter()
|
||||||
|
repr_pairs: Dict = {}
|
||||||
|
for example in get_examples():
|
||||||
|
for token in example.reference:
|
||||||
|
if token.lemma != 0:
|
||||||
|
tree_id = trees.add(token.text, token.lemma_)
|
||||||
|
tree_freqs[tree_id] += 1
|
||||||
|
repr_pairs[tree_id] = (token.text, token.lemma_)
|
||||||
|
|
||||||
|
# Construct trees that make the frequency cut-off using representative
|
||||||
|
# form - token pairs.
|
||||||
|
for tree_id, freq in tree_freqs.items():
|
||||||
|
if freq >= self.min_tree_freq:
|
||||||
|
form, lemma = repr_pairs[tree_id]
|
||||||
|
self._pair2label(form, lemma, add_label=True)
|
||||||
|
|
||||||
|
def _pair2label(self, form, lemma, add_label=False):
|
||||||
|
"""
|
||||||
|
Look up the edit tree identifier for a form/label pair. If the edit
|
||||||
|
tree is unknown and "add_label" is set, the edit tree will be added to
|
||||||
|
the labels.
|
||||||
|
"""
|
||||||
|
tree_id = self.trees.add(form, lemma)
|
||||||
|
if tree_id not in self.tree2label:
|
||||||
|
if not add_label:
|
||||||
|
return None
|
||||||
|
|
||||||
|
self.tree2label[tree_id] = len(self.cfg["labels"])
|
||||||
|
self.cfg["labels"].append(tree_id)
|
||||||
|
return self.tree2label[tree_id]
|
|
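A minimal usage sketch of the new component, assuming a blank English pipeline; the config values shown simply mirror the factory defaults above, and the training data handling is not shown.

import spacy

# Sketch: add the edit tree lemmatizer under its registered factory name.
nlp = spacy.blank("en")
nlp.add_pipe("trainable_lemmatizer", config={"backoff": "orth", "top_k": 1})
# Before it can predict lemmas, the component has to be initialized/trained,
# e.g. nlp.initialize(get_examples=lambda: train_examples) with Example objects.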
@@ -6,17 +6,17 @@ import srsly
 import random
 from thinc.api import CosineDistance, Model, Optimizer, Config
 from thinc.api import set_dropout_rate
-import warnings

 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
 from ..tokens import Doc, Span
 from .pipe import deserialize_config
+from .legacy.entity_linker import EntityLinker_v1
 from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..util import SimpleFrozenList, registry
 from .. import util
 from ..scorer import Scorer

@@ -26,7 +26,7 @@ BACKWARD_OVERWRITE = True

 default_model_config = """
 [model]
-@architectures = "spacy.EntityLinker.v1"
+@architectures = "spacy.EntityLinker.v2"

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -55,6 +55,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "overwrite": True,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
+        "use_gold_ents": True,
     },
     default_score_weights={
         "nel_micro_f": 1.0,

@@ -75,6 +76,7 @@ def make_entity_linker(
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     overwrite: bool,
     scorer: Optional[Callable],
+    use_gold_ents: bool,
 ):
     """Construct an EntityLinker component.

@@ -90,6 +92,22 @@ def make_entity_linker(
         produces a list of candidates, given a certain knowledge base and a textual mention.
     scorer (Optional[Callable]): The scoring method.
     """
+
+    if not model.attrs.get("include_span_maker", False):
+        # The only difference in arguments here is that use_gold_ents is not available
+        return EntityLinker_v1(
+            nlp.vocab,
+            model,
+            name,
+            labels_discard=labels_discard,
+            n_sents=n_sents,
+            incl_prior=incl_prior,
+            incl_context=incl_context,
+            entity_vector_length=entity_vector_length,
+            get_candidates=get_candidates,
+            overwrite=overwrite,
+            scorer=scorer,
+        )
     return EntityLinker(
         nlp.vocab,
         model,

@@ -102,6 +120,7 @@ def make_entity_linker(
         get_candidates=get_candidates,
         overwrite=overwrite,
         scorer=scorer,
+        use_gold_ents=use_gold_ents,
     )

@@ -136,6 +155,7 @@ class EntityLinker(TrainablePipe):
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         overwrite: bool = BACKWARD_OVERWRITE,
         scorer: Optional[Callable] = entity_linker_score,
+        use_gold_ents: bool,
     ) -> None:
         """Initialize an entity linker.

@@ -152,6 +172,8 @@ class EntityLinker(TrainablePipe):
             produces a list of candidates, given a certain knowledge base and a textual mention.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_links.
+        use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
+            component must provide entity annotations.

         DOCS: https://spacy.io/api/entitylinker#init
         """

@@ -169,6 +191,7 @@ class EntityLinker(TrainablePipe):
         # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
         self.kb = empty_kb(entity_vector_length)(self.vocab)
         self.scorer = scorer
+        self.use_gold_ents = use_gold_ents

     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will

@@ -212,14 +235,48 @@ class EntityLinker(TrainablePipe):
         doc_sample = []
         vector_sample = []
         for example in islice(get_examples(), 10):
-            doc_sample.append(example.x)
+            doc = example.x
+            if self.use_gold_ents:
+                doc.ents = example.y.ents
+            doc_sample.append(doc)
             vector_sample.append(self.model.ops.alloc1f(nO))
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
+
+        # XXX In order for size estimation to work, there has to be at least
+        # one entity. It's not used for training so it doesn't have to be real,
+        # so we add a fake one if none are present.
+        # We can't use Doc.has_annotation here because it can be True for docs
+        # that have been through an NER component but got no entities.
+        has_annotations = any([doc.ents for doc in doc_sample])
+        if not has_annotations:
+            doc = doc_sample[0]
+            ent = doc[0:1]
+            ent.label_ = "XXX"
+            doc.ents = (ent,)
+
         self.model.initialize(
             X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
         )
+
+        if not has_annotations:
+            # Clean up dummy annotation
+            doc.ents = []
+
+    def batch_has_learnable_example(self, examples):
+        """Check if a batch contains a learnable example.
+
+        If one isn't present, then the update step needs to be skipped.
+        """
+
+        for eg in examples:
+            for ent in eg.predicted.ents:
+                candidates = list(self.get_candidates(self.kb, ent))
+                if candidates:
+                    return True
+
+        return False
+
     def update(
         self,
         examples: Iterable[Example],

@@ -247,35 +304,29 @@ class EntityLinker(TrainablePipe):
         if not examples:
             return losses
         validate_examples(examples, "EntityLinker.update")
-        sentence_docs = []
-        for eg in examples:
-            sentences = [s for s in eg.reference.sents]
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                # KB ID of the first token is the same as the whole span
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    try:
-                        # find the sentence in the list of sentences.
-                        sent_index = sentences.index(ent.sent)
-                    except AttributeError:
-                        # Catch the exception when ent.sent is None and provide a user-friendly warning
-                        raise RuntimeError(Errors.E030) from None
-                    # get n previous sentences, if there are any
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    # get n posterior sentences, or as many < n as there are
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    # get token positions
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    # append that span as a doc to training
-                    sent_doc = eg.predicted[start_token:end_token].as_doc()
-                    sentence_docs.append(sent_doc)
         set_dropout_rate(self.model, drop)
-        if not sentence_docs:
-            warnings.warn(Warnings.W093.format(name="Entity Linker"))
+        docs = [eg.predicted for eg in examples]
+        # save to restore later
+        old_ents = [doc.ents for doc in docs]
+
+        for doc, ex in zip(docs, examples):
+            if self.use_gold_ents:
+                doc.ents = ex.reference.ents
+            else:
+                # only keep matching ents
+                doc.ents = ex.get_matching_ents()
+
+        # make sure we have something to learn from, if not, short-circuit
+        if not self.batch_has_learnable_example(examples):
             return losses
-        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
+
+        sentence_encodings, bp_context = self.model.begin_update(docs)
+
+        # now restore the ents
+        for doc, old in zip(docs, old_ents):
+            doc.ents = old
+
         loss, d_scores = self.get_loss(
             sentence_encodings=sentence_encodings, examples=examples
         )

@@ -288,24 +339,38 @@ class EntityLinker(TrainablePipe):
     def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         validate_examples(examples, "EntityLinker.get_loss")
         entity_encodings = []
+        eidx = 0  # indices in gold entities to keep
+        keep_ents = []  # indices in sentence_encodings to keep
+
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
+
             for ent in eg.reference.ents:
                 kb_id = kb_ids[ent.start]
                 if kb_id:
                     entity_encoding = self.kb.get_vector(kb_id)
                     entity_encodings.append(entity_encoding)
+                    keep_ents.append(eidx)
+
+                eidx += 1
         entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
-        if sentence_encodings.shape != entity_encodings.shape:
+        selected_encodings = sentence_encodings[keep_ents]
+
+        # If the entity encodings list is empty, then
+        if selected_encodings.shape != entity_encodings.shape:
             err = Errors.E147.format(
                 method="get_loss", msg="gold entities do not match up"
             )
             raise RuntimeError(err)
         # TODO: fix typing issue here
-        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)  # type: ignore
-        loss = self.distance.get_loss(sentence_encodings, entity_encodings)  # type: ignore
+        gradients = self.distance.get_grad(selected_encodings, entity_encodings)  # type: ignore
+        # to match the input size, we need to give a zero gradient for items not in the kb
+        out = self.model.ops.alloc2f(*sentence_encodings.shape)
+        out[keep_ents] = gradients
+
+        loss = self.distance.get_loss(selected_encodings, entity_encodings)  # type: ignore
         loss = loss / len(entity_encodings)
-        return float(loss), gradients
+        return float(loss), out

     def predict(self, docs: Iterable[Doc]) -> List[str]:
         """Apply the pipeline's model to a batch of docs, without modifying them.
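A hedged configuration sketch for the new use_gold_ents setting; `nlp` and `create_kb` are assumed to be defined elsewhere.

# Sketch: the flag is part of the component's default_config, so it can be
# passed when the pipe is added. create_kb is a hypothetical KB loader.
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": True})
entity_linker.set_kb(create_kb)  # assumed: Callable[[Vocab], KnowledgeBase]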
3  spacy/pipeline/legacy/__init__.py  Normal file

@@ -0,0 +1,3 @@
from .entity_linker import EntityLinker_v1

__all__ = ["EntityLinker_v1"]
427  spacy/pipeline/legacy/entity_linker.py  Normal file

@@ -0,0 +1,427 @@
# This file is present to provide a prior version of the EntityLinker component
# for backwards compatibility. For details see #9669.

from typing import Optional, Iterable, Callable, Dict, Union, List, Any
from thinc.types import Floats2d
from pathlib import Path
from itertools import islice
import srsly
import random
from thinc.api import CosineDistance, Model, Optimizer, Config
from thinc.api import set_dropout_rate
import warnings

from ...kb import KnowledgeBase, Candidate
from ...ml import empty_kb
from ...tokens import Doc, Span
from ..pipe import deserialize_config
from ..trainable_pipe import TrainablePipe
from ...language import Language
from ...vocab import Vocab
from ...training import Example, validate_examples, validate_get_examples
from ...errors import Errors, Warnings
from ...util import SimpleFrozenList, registry
from ... import util
from ...scorer import Scorer

# See #9050
BACKWARD_OVERWRITE = True


def entity_linker_score(examples, **kwargs):
    return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)


class EntityLinker_v1(TrainablePipe):
    """Pipeline component for named entity linking.

    DOCS: https://spacy.io/api/entitylinker
    """

    NIL = "NIL"  # string used to refer to a non-existing link

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "entity_linker",
        *,
        labels_discard: Iterable[str],
        n_sents: int,
        incl_prior: bool,
        incl_context: bool,
        entity_vector_length: int,
        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
        overwrite: bool = BACKWARD_OVERWRITE,
        scorer: Optional[Callable] = entity_linker_score,
    ) -> None:
        """Initialize an entity linker.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
        n_sents (int): The number of neighbouring sentences to take into account.
        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
        incl_context (bool): Whether or not to include the local context in the model.
        entity_vector_length (int): Size of encoding vectors in the KB.
        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
            produces a list of candidates, given a certain knowledge base and a textual mention.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_links.

        DOCS: https://spacy.io/api/entitylinker#init
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self.labels_discard = list(labels_discard)
        self.n_sents = n_sents
        self.incl_prior = incl_prior
        self.incl_context = incl_context
        self.get_candidates = get_candidates
        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
        self.distance = CosineDistance(normalize=False)
        # how many neighbour sentences to take into account
        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
        self.kb = empty_kb(entity_vector_length)(self.vocab)
        self.scorer = scorer

    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
        """Define the KB of this pipe by providing a function that will
        create it using this object's vocab."""
        if not callable(kb_loader):
            raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))

        self.kb = kb_loader(self.vocab)

    def validate_kb(self) -> None:
        # Raise an error if the knowledge base is not initialized.
        if self.kb is None:
            raise ValueError(Errors.E1018.format(name=self.name))
        if len(self.kb) == 0:
            raise ValueError(Errors.E139.format(name=self.name))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
            Note that providing this argument, will overwrite all data accumulated in the current KB.
            Use this only when loading a KB as-such from file.

        DOCS: https://spacy.io/api/entitylinker#initialize
        """
        validate_get_examples(get_examples, "EntityLinker_v1.initialize")
        if kb_loader is not None:
            self.set_kb(kb_loader)
        self.validate_kb()
        nO = self.kb.entity_vector_length
        doc_sample = []
        vector_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            vector_sample.append(self.model.ops.alloc1f(nO))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(
            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
        )

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/entitylinker#update
        """
        self.validate_kb()
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        if not examples:
            return losses
        validate_examples(examples, "EntityLinker_v1.update")
        sentence_docs = []
        for eg in examples:
            sentences = [s for s in eg.reference.sents]
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
            for ent in eg.reference.ents:
                # KB ID of the first token is the same as the whole span
                kb_id = kb_ids[ent.start]
                if kb_id:
                    try:
                        # find the sentence in the list of sentences.
                        sent_index = sentences.index(ent.sent)
                    except AttributeError:
                        # Catch the exception when ent.sent is None and provide a user-friendly warning
                        raise RuntimeError(Errors.E030) from None
                    # get n previous sentences, if there are any
                    start_sentence = max(0, sent_index - self.n_sents)
                    # get n posterior sentences, or as many < n as there are
                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
                    # get token positions
                    start_token = sentences[start_sentence].start
                    end_token = sentences[end_sentence].end
                    # append that span as a doc to training
                    sent_doc = eg.predicted[start_token:end_token].as_doc()
                    sentence_docs.append(sent_doc)
        set_dropout_rate(self.model, drop)
        if not sentence_docs:
            warnings.warn(Warnings.W093.format(name="Entity Linker"))
            return losses
        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
        loss, d_scores = self.get_loss(
            sentence_encodings=sentence_encodings, examples=examples
        )
        bp_context(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses

    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
        validate_examples(examples, "EntityLinker_v1.get_loss")
        entity_encodings = []
        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
            for ent in eg.reference.ents:
                kb_id = kb_ids[ent.start]
                if kb_id:
                    entity_encoding = self.kb.get_vector(kb_id)
                    entity_encodings.append(entity_encoding)
        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
        if sentence_encodings.shape != entity_encodings.shape:
            err = Errors.E147.format(
                method="get_loss", msg="gold entities do not match up"
            )
            raise RuntimeError(err)
        # TODO: fix typing issue here
        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)  # type: ignore
        loss = self.distance.get_loss(sentence_encodings, entity_encodings)  # type: ignore
        loss = loss / len(entity_encodings)
        return float(loss), gradients

    def predict(self, docs: Iterable[Doc]) -> List[str]:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        Returns the KB IDs for each entity in each doc, including NIL if there is
        no prediction.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS (List[str]): The models prediction for each document.

        DOCS: https://spacy.io/api/entitylinker#predict
        """
        self.validate_kb()
        entity_count = 0
        final_kb_ids: List[str] = []
        if not docs:
            return final_kb_ids
        if isinstance(docs, Doc):
            docs = [docs]
        for i, doc in enumerate(docs):
            sentences = [s for s in doc.sents]
            if len(doc) > 0:
                # Looping through each entity (TODO: rewrite)
                for ent in doc.ents:
                    sent = ent.sent
                    sent_index = sentences.index(sent)
                    assert sent_index >= 0
                    # get n_neighbour sentences, clipped to the length of the document
                    start_sentence = max(0, sent_index - self.n_sents)
                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
                    start_token = sentences[start_sentence].start
                    end_token = sentences[end_sentence].end
                    sent_doc = doc[start_token:end_token].as_doc()
                    # currently, the context is the same for each entity in a sentence (should be refined)
                    xp = self.model.ops.xp
                    if self.incl_context:
                        sentence_encoding = self.model.predict([sent_doc])[0]
                        sentence_encoding_t = sentence_encoding.T
                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
                    entity_count += 1
                    if ent.label_ in self.labels_discard:
                        # ignoring this entity - setting to NIL
                        final_kb_ids.append(self.NIL)
                    else:
                        candidates = list(self.get_candidates(self.kb, ent))
                        if not candidates:
                            # no prediction possible for this entity - setting to NIL
                            final_kb_ids.append(self.NIL)
                        elif len(candidates) == 1:
                            # shortcut for efficiency reasons: take the 1 candidate
                            # TODO: thresholding
                            final_kb_ids.append(candidates[0].entity_)
                        else:
                            random.shuffle(candidates)
                            # set all prior probabilities to 0 if incl_prior=False
                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
                            if not self.incl_prior:
                                prior_probs = xp.asarray([0.0 for _ in candidates])
                            scores = prior_probs
                            # add in similarity from the context
                            if self.incl_context:
                                entity_encodings = xp.asarray(
                                    [c.entity_vector for c in candidates]
                                )
                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
                                if len(entity_encodings) != len(prior_probs):
                                    raise RuntimeError(
                                        Errors.E147.format(
                                            method="predict",
                                            msg="vectors not of equal length",
                                        )
                                    )
                                # cosine similarity
                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
                                    sentence_norm * entity_norm
                                )
                                if sims.shape != prior_probs.shape:
                                    raise ValueError(Errors.E161)
                                scores = prior_probs + sims - (prior_probs * sims)
                            # TODO: thresholding
                            best_index = scores.argmax().item()
                            best_candidate = candidates[best_index]
                            final_kb_ids.append(best_candidate.entity_)
        if not (len(final_kb_ids) == entity_count):
            err = Errors.E147.format(
                method="predict", msg="result variables not of equal length"
            )
            raise RuntimeError(err)
        return final_kb_ids

    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.

        DOCS: https://spacy.io/api/entitylinker#set_annotations
        """
        count_ents = len([ent for doc in docs for ent in doc.ents])
        if count_ents != len(kb_ids):
            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
        i = 0
        overwrite = self.cfg["overwrite"]
        for doc in docs:
            for ent in doc.ents:
                kb_id = kb_ids[i]
                i += 1
                for token in ent:
                    if token.ent_kb_id == 0 or overwrite:
                        token.ent_kb_id_ = kb_id

    def to_bytes(self, *, exclude=tuple()):
        """Serialize the pipe to a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/entitylinker#to_bytes
        """
        self._validate_serialization_attrs()
        serialize = {}
        if hasattr(self, "cfg") and self.cfg is not None:
            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
        serialize["kb"] = self.kb.to_bytes
        serialize["model"] = self.model.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        """Load the pipe from a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (TrainablePipe): The loaded object.

        DOCS: https://spacy.io/api/entitylinker#from_bytes
        """
        self._validate_serialization_attrs()

        def load_model(b):
            try:
                self.model.from_bytes(b)
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize = {}
        if hasattr(self, "cfg") and self.cfg is not None:
            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
        deserialize["model"] = load_model
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Serialize the pipe to disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/entitylinker#to_disk
        """
        serialize = {}
        serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["kb"] = lambda p: self.kb.to_disk(p)
        serialize["model"] = lambda p: self.model.to_disk(p)
        util.to_disk(path, serialize, exclude)

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityLinker_v1":
        """Load the pipe from disk. Modifies the object in place and returns it.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (EntityLinker): The modified EntityLinker object.

        DOCS: https://spacy.io/api/entitylinker#from_disk
        """

        def load_model(p):
            try:
                with p.open("rb") as infile:
                    self.model.from_bytes(infile.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize: Dict[str, Callable[[Any], Any]] = {}
        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
        deserialize["kb"] = lambda p: self.kb.from_disk(p)
        deserialize["model"] = load_model
        util.from_disk(path, deserialize, exclude)
        return self

    def rehearse(self, examples, *, sgd=None, losses=None, **config):
        raise NotImplementedError

    def add_label(self, label):
        raise NotImplementedError
@@ -25,7 +25,7 @@ BACKWARD_EXTEND = False

 default_model_config = """
 [model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"

 [model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -26,6 +26,8 @@ class Pipe:
     @property
     def labels(self) -> Tuple[str, ...]: ...
     @property
+    def hide_labels(self) -> bool: ...
+    @property
     def label_data(self) -> Any: ...
     def _require_labels(self) -> None: ...
     def set_error_handler(
@@ -102,6 +102,10 @@ cdef class Pipe:
     def labels(self) -> Tuple[str, ...]:
         return tuple()

+    @property
+    def hide_labels(self) -> bool:
+        return False
+
     @property
     def label_data(self):
         """Optional JSON-serializable data that would be sufficient to recreate
@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
-from itertools import islice
 from typing import Optional, Callable
+from itertools import islice

 import srsly
 from thinc.api import Model, SequenceCategoricalCrossentropy, Config

@@ -20,7 +20,7 @@ BACKWARD_OVERWRITE = False

 default_model_config = """
 [model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])

+    @property
+    def hide_labels(self):
+        return True
+
     @property
     def label_data(self):
         return None
@@ -1,9 +1,10 @@
-import numpy
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d

+import numpy
+
 from ..compat import Protocol, runtime_checkable
 from ..scorer import Scorer
 from ..language import Language

@@ -271,6 +272,24 @@ class SpanCategorizer(TrainablePipe):
         scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores

+    def set_candidates(
+        self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
+    ) -> None:
+        """Use the spancat suggester to add a list of span candidates to a list of docs.
+        This method is intended to be used for debugging purposes.
+
+        docs (Iterable[Doc]): The documents to modify.
+        candidates_key (str): Key of the Doc.spans dict to save the candidate spans under.
+
+        DOCS: https://spacy.io/api/spancategorizer#set_candidates
+        """
+        suggester_output = self.suggester(docs, ops=self.model.ops)
+
+        for candidates, doc in zip(suggester_output, docs):  # type: ignore
+            doc.spans[candidates_key] = []
+            for index in candidates.dataXd:
+                doc.spans[candidates_key].append(doc[index[0] : index[1]])
+
     def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.

@@ -377,7 +396,7 @@ class SpanCategorizer(TrainablePipe):
         # If the prediction is 0.9 and it's false, the gradient will be
         # 0.9 (0.9 - 0.0)
         d_scores = scores - target
-        loss = float((d_scores ** 2).sum())
+        loss = float((d_scores**2).sum())
         return loss, d_scores

     def initialize(
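A short sketch of the new debugging helper added above, assuming a pipeline `nlp` with a configured "spancat" component.

# Sketch: write the suggester's candidate spans into doc.spans for inspection.
doc = nlp("The quick brown fox jumps over the lazy dog.")
spancat = nlp.get_pipe("spancat")
spancat.set_candidates([doc], candidates_key="candidates")
for span in doc.spans["candidates"]:
    print(span.start, span.end, span.text)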
@@ -27,7 +27,7 @@ BACKWARD_OVERWRITE = False

 default_model_config = """
 [model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -225,6 +225,7 @@ class Tagger(TrainablePipe):

         DOCS: https://spacy.io/api/tagger#rehearse
         """
+        loss_func = SequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)

@@ -236,12 +237,12 @@ class Tagger(TrainablePipe):
             # Handle cases where there are no tokens in any docs.
             return losses
         set_dropout_rate(self.model, drop)
-        guesses, backprop = self.model.begin_update(docs)
-        target = self._rehearsal_model(examples)
-        gradient = guesses - target
-        backprop(gradient)
+        tag_scores, bp_tag_scores = self.model.begin_update(docs)
+        tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
+        grads, loss = loss_func(tag_scores, tutor_tag_scores)
+        bp_tag_scores(grads)
         self.finish_update(sgd)
-        losses[self.name] += (gradient**2).sum()
+        losses[self.name] += loss
         return losses

     def get_loss(self, examples, scores):
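A sketch of how the reworked rehearsal update is typically driven, assuming a trained pipeline `nlp` containing a "tagger" and a list `raw_texts` of plain strings.

from spacy.training import Example

# Sketch: rehearsal uses unlabelled examples; the tutor model supplies targets.
raw_examples = [Example.from_dict(nlp.make_doc(text), {}) for text in raw_texts]
optimizer = nlp.resume_training()
losses = {}
nlp.rehearse(raw_examples, sgd=optimizer, losses=losses)
print(losses.get("tagger"))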
@@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe):
         self.cfg = dict(cfg)
         self.scorer = scorer

+    @property
+    def support_missing_values(self):
+        # There are no missing values as the textcat should always
+        # predict exactly one label. All other labels are 0.0
+        # Subclasses may override this property to change internal behaviour.
+        return False
+
     @property
     def labels(self) -> Tuple[str]:
         """RETURNS (Tuple[str]): The labels currently added to the component.

@@ -276,12 +283,12 @@ class TextCategorizer(TrainablePipe):
             return losses
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
-        target = self._rehearsal_model(examples)
+        target, _ = self._rehearsal_model.begin_update(docs)
         gradient = scores - target
         bp_scores(gradient)
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += (gradient ** 2).sum()
+        losses[self.name] += (gradient**2).sum()
         return losses

     def _examples_to_truth(

@@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe):
             for j, label in enumerate(self.labels):
                 if label in eg.reference.cats:
                     truths[i, j] = eg.reference.cats[label]
-                else:
+                elif self.support_missing_values:
                     not_missing[i, j] = 0.0
         truths = self.model.ops.asarray(truths)  # type: ignore
         return truths, not_missing  # type: ignore

@@ -313,9 +320,9 @@ class TextCategorizer(TrainablePipe):
         self._validate_categories(examples)
         truths, not_missing = self._examples_to_truth(examples)
         not_missing = self.model.ops.asarray(not_missing)  # type: ignore
-        d_scores = (scores - truths) / scores.shape[0]
+        d_scores = scores - truths
         d_scores *= not_missing
-        mean_square_error = (d_scores ** 2).sum(axis=1).mean()
+        mean_square_error = (d_scores**2).mean()
        return float(mean_square_error), d_scores

     def add_label(self, label: str) -> int:
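A small illustration of the annotation convention the new support_missing_values property encodes; the category names here are made up.

# Sketch: for the exclusive-label textcat (support_missing_values = False),
# a label absent from the gold cats dict is scored as 0.0; for the
# multilabel subclass (support_missing_values = True) it stays missing.
gold_cats_exclusive = {"POSITIVE": 1.0}   # "NEGATIVE" treated as 0.0
gold_cats_multilabel = {"SPORTS": 1.0}    # other labels remain unannotated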
@@ -1,8 +1,8 @@
-from itertools import islice
 from typing import Iterable, Optional, Dict, List, Callable, Any
-from thinc.api import Model, Config
 from thinc.types import Floats2d
+from thinc.api import Model, Config
+
+from itertools import islice

 from ..language import Language
 from ..training import Example, validate_get_examples

@@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         self.cfg = dict(cfg)
         self.scorer = scorer

+    @property
+    def support_missing_values(self):
+        return True
+
     def initialize(  # type: ignore[override]
         self,
         get_examples: Callable[[], Iterable[Example]],
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):

        DOCS: https://spacy.io/api/tok2vec#predict
        """
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            width = self.model.get_dim("nO")
+            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:
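The added guard above short-circuits prediction when every Doc in the batch is empty, returning one zero-row array of the model's output width per doc instead of calling the model. A rough NumPy-only sketch of the same idea, with a made-up width and empty stand-in docs:

# Sketch of the empty-batch guard, with NumPy standing in for the model ops.
import numpy as np

docs = ["", ""]          # stand-ins for Doc objects with no tokens
width = 96               # stand-in for self.model.get_dim("nO")
if not any(len(doc) for doc in docs):
    # one (0, width) array per doc, no model call
    tokvecs = [np.zeros((0, width), dtype="f") for _ in docs]
    print([v.shape for v in tokvecs])  # [(0, 96), (0, 96)]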
@@ -228,7 +228,7 @@ class Scorer:
            if token.orth_.isspace():
                continue
            if align.x2y.lengths[token.i] == 1:
-                gold_i = align.x2y[token.i].dataXd[0, 0]
+                gold_i = align.x2y[token.i][0]
                if gold_i not in missing_indices:
                    pred_tags.add((gold_i, getter(token, attr)))
        tag_score.score_set(pred_tags, gold_tags)

@@ -287,7 +287,7 @@ class Scorer:
            if token.orth_.isspace():
                continue
            if align.x2y.lengths[token.i] == 1:
-                gold_i = align.x2y[token.i].dataXd[0, 0]
+                gold_i = align.x2y[token.i][0]
                if gold_i not in missing_indices:
                    value = getter(token, attr)
                    morph = gold_doc.vocab.strings[value]
@@ -553,7 +553,8 @@ class Scorer:
            getter(doc, attr) should return the values for the individual doc.
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
-            Defaults to True.
+            Defaults to True. When set to False (exclusive labels), missing
+            gold labels are interpreted as 0.0.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
@@ -592,13 +593,15 @@ class Scorer:

            for label in labels:
                pred_score = pred_cats.get(label, 0.0)
-                gold_score = gold_cats.get(label, 0.0)
+                gold_score = gold_cats.get(label)
+                if not gold_score and not multi_label:
+                    gold_score = 0.0
                if gold_score is not None:
                    auc_per_type[label].score_set(pred_score, gold_score)
            if multi_label:
                for label in labels:
                    pred_score = pred_cats.get(label, 0.0)
-                    gold_score = gold_cats.get(label, 0.0)
+                    gold_score = gold_cats.get(label)
                    if gold_score is not None:
                        if pred_score >= threshold and gold_score > 0:
                            f_per_type[label].tp += 1
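The change above makes gold_cats.get(label) return None for unannotated labels so they can be skipped, except in the exclusive-label case, where a missing gold label now falls back to 0.0. A plain-Python sketch of that rule with illustrative category dicts:

# Sketch of the missing-gold-label rule from the hunk above (values are assumptions).
pred_cats = {"POS": 0.8, "NEG": 0.1}
gold_cats = {"POS": 1.0}             # "NEG" is not annotated
multi_label = False

for label in ["POS", "NEG"]:
    pred_score = pred_cats.get(label, 0.0)
    gold_score = gold_cats.get(label)
    if not gold_score and not multi_label:
        gold_score = 0.0             # exclusive labels: missing counts as negative
    if gold_score is not None:
        print(label, pred_score, gold_score)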
@@ -610,16 +613,15 @@ class Scorer:
                # Get the highest-scoring for each.
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
-                if gold_score is not None:
-                    if pred_label == gold_label and pred_score >= threshold:
-                        f_per_type[pred_label].tp += 1
-                    else:
-                        f_per_type[gold_label].fn += 1
-                        if pred_score >= threshold:
-                            f_per_type[pred_label].fp += 1
+                if pred_label == gold_label and pred_score >= threshold:
+                    f_per_type[pred_label].tp += 1
+                else:
+                    f_per_type[gold_label].fn += 1
+                    if pred_score >= threshold:
+                        f_per_type[pred_label].fp += 1
            elif gold_cats:
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
-                if gold_score is not None and gold_score > 0:
+                if gold_score > 0:
                    f_per_type[gold_label].fn += 1
            elif pred_cats:
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
@ -800,13 +802,13 @@ class Scorer:
|
||||||
if align.x2y.lengths[token.i] != 1:
|
if align.x2y.lengths[token.i] != 1:
|
||||||
gold_i = None # type: ignore
|
gold_i = None # type: ignore
|
||||||
else:
|
else:
|
||||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
gold_i = align.x2y[token.i][0]
|
||||||
if gold_i not in missing_indices:
|
if gold_i not in missing_indices:
|
||||||
dep = getter(token, attr)
|
dep = getter(token, attr)
|
||||||
head = head_getter(token, head_attr)
|
head = head_getter(token, head_attr)
|
||||||
if dep not in ignore_labels and token.orth_.strip():
|
if dep not in ignore_labels and token.orth_.strip():
|
||||||
if align.x2y.lengths[head.i] == 1:
|
if align.x2y.lengths[head.i] == 1:
|
||||||
gold_head = align.x2y[head.i].dataXd[0, 0]
|
gold_head = align.x2y[head.i][0]
|
||||||
else:
|
else:
|
||||||
gold_head = None
|
gold_head = None
|
||||||
# None is indistinct, so we can't just add it to the set
|
# None is indistinct, so we can't just add it to the set
|
||||||
|
@@ -856,7 +858,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        for pred_ent in eg.x.ents:
            if pred_ent.label_ not in score_per_type:
                score_per_type[pred_ent.label_] = PRFScore()
-            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
+            indices = align_x2y[pred_ent.start : pred_ent.end]
            if len(indices):
                g_span = eg.y[indices[0] : indices[-1] + 1]
                # Check we aren't missing annotation on this span. If so,
@@ -99,6 +99,11 @@ def de_vocab():
     return get_lang_class("de")().vocab


+@pytest.fixture(scope="session")
+def dsb_tokenizer():
+    return get_lang_class("dsb")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def el_tokenizer():
     return get_lang_class("el")().tokenizer

@@ -155,6 +160,11 @@ def fr_tokenizer():
     return get_lang_class("fr")().tokenizer


+@pytest.fixture(scope="session")
+def fr_vocab():
+    return get_lang_class("fr")().vocab
+
+
 @pytest.fixture(scope="session")
 def ga_tokenizer():
     return get_lang_class("ga")().tokenizer

@@ -205,18 +215,41 @@ def it_tokenizer():
     return get_lang_class("it")().tokenizer


+@pytest.fixture(scope="session")
+def it_vocab():
+    return get_lang_class("it")().vocab
+
+
 @pytest.fixture(scope="session")
 def ja_tokenizer():
     pytest.importorskip("sudachipy")
     return get_lang_class("ja")().tokenizer


+@pytest.fixture(scope="session")
+def hsb_tokenizer():
+    return get_lang_class("hsb")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ko_tokenizer():
     pytest.importorskip("natto")
     return get_lang_class("ko")().tokenizer


+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer

@@ -324,6 +357,11 @@ def sv_tokenizer():
     return get_lang_class("sv")().tokenizer


+@pytest.fixture(scope="session")
+def ta_tokenizer():
+    return get_lang_class("ta")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def th_tokenizer():
     pytest.importorskip("pythainlp")
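The new ko_tokenizer_tokenizer fixture above swaps the default Korean tokenizer for the rule-based spacy.Tokenizer.v1 through the nlp config. The same override can be sketched outside the test suite roughly as follows; it assumes spaCy with its "ko" language data is installed, and the sample sentence is illustrative:

# Sketch of the config-based tokenizer override used by the fixture above.
from spacy.util import get_lang_class

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.Tokenizer.v1",
        }
    }
}
nlp = get_lang_class("ko").from_config(config)  # no MeCab/natto needed with this tokenizer
doc = nlp.tokenizer("안녕하세요 세계")
print([t.text for t in doc])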
@@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
     attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
     for attr in attrs:
         assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)

     doc[0].tag_ = "A"
     doc[0].pos_ = "X"

@@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
         assert doc.has_annotation(attr, require_complete=True)


+def test_has_annotation_sents(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
+    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
+    for attr in attrs:
+        assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    # The first token (index 0) is always assumed to be a sentence start,
+    # and ignored by the check in doc.has_annotation
+
+    doc[1].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    doc[2].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert doc.has_annotation(attr, require_complete=True)
+
+
 def test_is_flags_deprecated(en_tokenizer):
     doc = en_tokenizer("test")
     with pytest.deprecated_call():
@@ -573,6 +573,55 @@ def test_span_with_vectors(doc):
     doc.vocab.vectors = prev_vectors


+# fmt: off
+def test_span_comparison(doc):
+
+    # Identical start, end, only differ in label and kb_id
+    assert Span(doc, 0, 3) == Span(doc, 0, 3)
+    assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3)
+    assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
+    assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
+
+    assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3))
+    assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3))
+
+    # Different end
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4)
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4)
+    assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    # Different start
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
+    assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    # Different start & different end
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
+    assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+# fmt: on
+
+
 @pytest.mark.parametrize(
     "start,end,expected_sentences,expected_sentences_with_hook",
     [

@@ -606,3 +655,16 @@ def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with
 def test_span_sents_not_parsed(doc_not_parsed):
     with pytest.raises(ValueError):
         list(Span(doc_not_parsed, 0, 3).sents)
+
+
+def test_span_group_copy(doc):
+    doc.spans["test"] = [doc[0:1], doc[2:4]]
+    assert len(doc.spans["test"]) == 2
+    doc_copy = doc.copy()
+    # check that the spans were indeed copied
+    assert len(doc_copy.spans["test"]) == 2
+    # add a new span to the original doc
+    doc.spans["test"].append(doc[3:4])
+    assert len(doc.spans["test"]) == 3
+    # check that the copy spans were not modified and this is an isolated doc
+    assert len(doc_copy.spans["test"]) == 2

spacy/tests/doc/test_span_group.py (new file, 242 lines)
@@ -0,0 +1,242 @@
import pytest
from random import Random
from spacy.matcher import Matcher
from spacy.tokens import Span, SpanGroup


@pytest.fixture
def doc(en_tokenizer):
    doc = en_tokenizer("0 1 2 3 4 5 6")
    matcher = Matcher(en_tokenizer.vocab, validate=True)

    # fmt: off
    matcher.add("4", [[{}, {}, {}, {}]])
    matcher.add("2", [[{}, {}, ]])
    matcher.add("1", [[{}, ]])
    # fmt: on
    matches = matcher(doc)
    spans = []
    for match in matches:
        spans.append(
            Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
        )
    Random(42).shuffle(spans)
    doc.spans["SPANS"] = SpanGroup(
        doc, name="SPANS", attrs={"key": "value"}, spans=spans
    )
    return doc


@pytest.fixture
def other_doc(en_tokenizer):
    doc = en_tokenizer("0 1 2 3 4 5 6")
    matcher = Matcher(en_tokenizer.vocab, validate=True)

    # fmt: off
    matcher.add("4", [[{}, {}, {}, {}]])
    matcher.add("2", [[{}, {}, ]])
    matcher.add("1", [[{}, ]])
    # fmt: on

    matches = matcher(doc)
    spans = []
    for match in matches:
        spans.append(
            Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
        )
    Random(42).shuffle(spans)
    doc.spans["SPANS"] = SpanGroup(
        doc, name="SPANS", attrs={"key": "value"}, spans=spans
    )
    return doc


@pytest.fixture
def span_group(en_tokenizer):
    doc = en_tokenizer("0 1 2 3 4 5 6")
    matcher = Matcher(en_tokenizer.vocab, validate=True)

    # fmt: off
    matcher.add("4", [[{}, {}, {}, {}]])
    matcher.add("2", [[{}, {}, ]])
    matcher.add("1", [[{}, ]])
    # fmt: on

    matches = matcher(doc)
    spans = []
    for match in matches:
        spans.append(
            Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
        )
    Random(42).shuffle(spans)
    doc.spans["SPANS"] = SpanGroup(
        doc, name="SPANS", attrs={"key": "value"}, spans=spans
    )


def test_span_group_copy(doc):
    span_group = doc.spans["SPANS"]
    clone = span_group.copy()
    assert clone != span_group
    assert clone.name == span_group.name
    assert clone.attrs == span_group.attrs
    assert len(clone) == len(span_group)
    assert list(span_group) == list(clone)
    clone.name = "new_name"
    clone.attrs["key"] = "new_value"
    clone.append(Span(doc, 0, 6, "LABEL"))
    assert clone.name != span_group.name
    assert clone.attrs != span_group.attrs
    assert span_group.attrs["key"] == "value"
    assert list(span_group) != list(clone)


def test_span_group_set_item(doc, other_doc):
    span_group = doc.spans["SPANS"]

    index = 5
    span = span_group[index]
    span.label_ = "NEW LABEL"
    span.kb_id = doc.vocab.strings["KB_ID"]

    assert span_group[index].label != span.label
    assert span_group[index].kb_id != span.kb_id

    span_group[index] = span
    assert span_group[index].start == span.start
    assert span_group[index].end == span.end
    assert span_group[index].label == span.label
    assert span_group[index].kb_id == span.kb_id
    assert span_group[index] == span

    with pytest.raises(IndexError):
        span_group[-100] = span
    with pytest.raises(IndexError):
        span_group[100] = span

    span = Span(other_doc, 0, 2)
    with pytest.raises(ValueError):
        span_group[index] = span


def test_span_group_has_overlap(doc):
    span_group = doc.spans["SPANS"]
    assert span_group.has_overlap


def test_span_group_concat(doc, other_doc):
    span_group_1 = doc.spans["SPANS"]
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )
    span_group_3 = span_group_1._concat(span_group_2)
    assert span_group_3.name == span_group_1.name
    assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
    span_list_expected = list(span_group_1) + list(span_group_2)
    assert list(span_group_3) == list(span_list_expected)

    # Inplace
    span_list_expected = list(span_group_1) + list(span_group_2)
    span_group_3 = span_group_1._concat(span_group_2, inplace=True)
    assert span_group_3 == span_group_1
    assert span_group_3.name == span_group_1.name
    assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_3) == list(span_list_expected)

    span_group_2 = other_doc.spans["SPANS"]
    with pytest.raises(ValueError):
        span_group_1._concat(span_group_2)


def test_span_doc_delitem(doc):
    span_group = doc.spans["SPANS"]
    length = len(span_group)
    index = 5
    span = span_group[index]
    next_span = span_group[index + 1]
    del span_group[index]
    assert len(span_group) == length - 1
    assert span_group[index] != span
    assert span_group[index] == next_span

    with pytest.raises(IndexError):
        del span_group[-100]
    with pytest.raises(IndexError):
        del span_group[100]


def test_span_group_add(doc):
    span_group_1 = doc.spans["SPANS"]
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )

    span_group_3_expected = span_group_1._concat(span_group_2)

    span_group_3 = span_group_1 + span_group_2
    assert len(span_group_3) == len(span_group_3_expected)
    assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_3) == list(span_group_3_expected)


def test_span_group_iadd(doc):
    span_group_1 = doc.spans["SPANS"].copy()
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )

    span_group_1_expected = span_group_1._concat(span_group_2)

    span_group_1 += span_group_2
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_1) == list(span_group_1_expected)

    span_group_1 = doc.spans["SPANS"].copy()
    span_group_1 += spans
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {
        "key": "value",
    }
    assert list(span_group_1) == list(span_group_1_expected)


def test_span_group_extend(doc):
    span_group_1 = doc.spans["SPANS"].copy()
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )

    span_group_1_expected = span_group_1._concat(span_group_2)

    span_group_1.extend(span_group_2)
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_1) == list(span_group_1_expected)

    span_group_1 = doc.spans["SPANS"]
    span_group_1.extend(spans)
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {"key": "value"}
    assert list(span_group_1) == list(span_group_1_expected)


def test_span_group_dealloc(span_group):
    with pytest.raises(AttributeError):
        print(span_group.doc)
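The new test_span_group.py above exercises SpanGroup copying, concatenation and extension. A minimal usage sketch follows; it assumes a spaCy build that already ships the SpanGroup.copy/extend behaviour these tests cover, and the blank English pipeline and example spans are illustrative:

# Quick SpanGroup usage sketch (assumes the behaviour tested above is available).
import spacy
from spacy.tokens import SpanGroup

nlp = spacy.blank("en")
doc = nlp("0 1 2 3 4 5 6")
group = SpanGroup(doc, name="SPANS", attrs={"key": "value"}, spans=[doc[0:2], doc[2:4]])
clone = group.copy()        # independent copy: same name, attrs and spans
group.extend([doc[4:6]])    # extending the original leaves the clone untouched
print(len(group), len(clone))  # 3 2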
@@ -1,5 +1,5 @@
 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span


 @pytest.fixture()

@@ -60,3 +60,13 @@ def test_doc_to_json_underscore_error_serialize(doc):
     Doc.set_extension("json_test4", method=lambda doc: doc.text)
     with pytest.raises(ValueError):
         doc.to_json(underscore=["json_test4"])
+
+
+def test_doc_to_json_span(doc):
+    """Test that Doc.to_json() includes spans"""
+    doc.spans["test"] = [Span(doc, 0, 2, "test"), Span(doc, 0, 1, "test")]
+    json_doc = doc.to_json()
+    assert "spans" in json_doc
+    assert len(json_doc["spans"]) == 1
+    assert len(json_doc["spans"]["test"]) == 2
+    assert json_doc["spans"]["test"][0]["start"] == 0

spacy/tests/lang/dsb/__init__.py (new, empty file)

spacy/tests/lang/dsb/test_text.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jadno", True),
        ("dwanassćo", True),
        ("milion", True),
        ("sto", True),
        ("ceła", False),
        ("kopica", False),
        ("narěcow", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
    tokens = dsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match

spacy/tests/lang/dsb/test_tokenizer.py (new file, 29 lines)
@@ -0,0 +1,29 @@
import pytest

DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale",
            "eksistěrujo",
            "mimo",
            "togo",
            "ceła",
            "kopica",
            "narěcow",
            ",",
            "ako",
            "na",
            "pśikład",
            "slěpjańska",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    tokens = dsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
spacy/tests/lang/fi/test_noun_chunks.py (new file, 189 lines)
@@ -0,0 +1,189 @@
import pytest
from spacy.tokens import Doc


FI_NP_TEST_EXAMPLES = [
|
(
|
||||||
|
"Kaksi tyttöä potkii punaista palloa",
|
||||||
|
["NUM", "NOUN", "VERB", "ADJ", "NOUN"],
|
||||||
|
["nummod", "nsubj", "ROOT", "amod", "obj"],
|
||||||
|
[1, 1, 0, 1, -2],
|
||||||
|
["Kaksi tyttöä", "punaista palloa"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Erittäin vaarallinen leijona karkasi kiertävän sirkuksen eläintenkesyttäjältä",
|
||||||
|
["ADV", "ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
|
||||||
|
["advmod", "amod", "nsubj", "ROOT", "amod", "nmod:poss", "obl"],
|
||||||
|
[1, 1, 1, 0, 1, 1, -3],
|
||||||
|
["Erittäin vaarallinen leijona", "kiertävän sirkuksen eläintenkesyttäjältä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Leijona raidallisine tassuineen piileksii Porin kaupungin lähellä",
|
||||||
|
["NOUN", "ADJ", "NOUN", "VERB", "PROPN", "NOUN", "ADP"],
|
||||||
|
["nsubj", "amod", "nmod", "ROOT", "nmod:poss", "obl", "case"],
|
||||||
|
[3, 1, -2, 0, 1, -2, -1],
|
||||||
|
["Leijona raidallisine tassuineen", "Porin kaupungin"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Lounaalla nautittiin salaattia, maukasta kanaa ja raikasta vettä",
|
||||||
|
["NOUN", "VERB", "NOUN", "PUNCT", "ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"],
|
||||||
|
["obl", "ROOT", "obj", "punct", "amod", "conj", "cc", "amod", "conj"],
|
||||||
|
[1, 0, -1, 2, 1, -3, 2, 1, -6],
|
||||||
|
["Lounaalla", "salaattia", "maukasta kanaa", "raikasta vettä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Minua houkuttaa maalle muuttaminen talven jälkeen",
|
||||||
|
["PRON", "VERB", "NOUN", "NOUN", "NOUN", "ADP"],
|
||||||
|
["obj", "ROOT", "nmod", "nsubj", "obl", "case"],
|
||||||
|
[1, 0, 1, -2, -3, -1],
|
||||||
|
["maalle muuttaminen", "talven"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Päivän kohokohta oli vierailu museossa kummilasten kanssa",
|
||||||
|
["NOUN", "NOUN", "AUX", "NOUN", "NOUN", "NOUN", "ADP"],
|
||||||
|
["nmod:poss", "nsubj:cop", "cop", "ROOT", "nmod", "obl", "case"],
|
||||||
|
[1, 2, 1, 0, -1, -2, -1],
|
||||||
|
["Päivän kohokohta", "vierailu museossa", "kummilasten"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Yrittäjät maksoivat tuomioistuimen määräämät korvaukset",
|
||||||
|
["NOUN", "VERB", "NOUN", "VERB", "NOUN"],
|
||||||
|
["nsubj", "ROOT", "nsubj", "acl", "obj"],
|
||||||
|
[1, 0, 1, 1, -3],
|
||||||
|
["Yrittäjät", "tuomioistuimen", "korvaukset"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Julkisoikeudelliset tai niihin rinnastettavat saatavat ovat suoraan ulosottokelpoisia",
|
||||||
|
["ADJ", "CCONJ", "PRON", "VERB", "NOUN", "AUX", "ADV", "NOUN"],
|
||||||
|
["amod", "cc", "obl", "acl", "nsubj:cop", "cop", "advmod", "ROOT"],
|
||||||
|
[4, 3, 1, 1, 3, 2, 1, 0],
|
||||||
|
["Julkisoikeudelliset tai niihin rinnastettavat saatavat", "ulosottokelpoisia"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Se oli ala-arvoista käytöstä kaikilta oppilailta, myös valvojaoppilailta",
|
||||||
|
["PRON", "AUX", "ADJ", "NOUN", "PRON", "NOUN", "PUNCT", "ADV", "NOUN"],
|
||||||
|
["nsubj:cop", "cop", "amod", "ROOT", "det", "nmod", "punct", "advmod", "appos"],
|
||||||
|
[3, 2, 1, 0, 1, -2, 2, 1, -3],
|
||||||
|
["ala-arvoista käytöstä kaikilta oppilailta", "valvojaoppilailta"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Isä souti veneellä, jonka hän oli vuokrannut",
|
||||||
|
["NOUN", "VERB", "NOUN", "PUNCT", "PRON", "PRON", "AUX", "VERB"],
|
||||||
|
["nsubj", "ROOT", "obl", "punct", "obj", "nsubj", "aux", "acl:relcl"],
|
||||||
|
[1, 0, -1, 4, 3, 2, 1, -5],
|
||||||
|
["Isä", "veneellä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Kirja, jonka poimin hyllystä, kertoo norsuista",
|
||||||
|
["NOUN", "PUNCT", "PRON", "VERB", "NOUN", "PUNCT", "VERB", "NOUN"],
|
||||||
|
["nsubj", "punct", "obj", "acl:relcl", "obl", "punct", "ROOT", "obl"],
|
||||||
|
[6, 2, 1, -3, -1, 1, 0, -1],
|
||||||
|
["Kirja", "hyllystä", "norsuista"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Huomenna on päivä, jota olemme odottaneet",
|
||||||
|
["NOUN", "AUX", "NOUN", "PUNCT", "PRON", "AUX", "VERB"],
|
||||||
|
["ROOT", "cop", "nsubj:cop", "punct", "obj", "aux", "acl:relcl"],
|
||||||
|
[0, -1, -2, 3, 2, 1, -4],
|
||||||
|
["Huomenna", "päivä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Liikkuvuuden lisääminen on yksi korkeakoulutuksen keskeisistä kehittämiskohteista",
|
||||||
|
["NOUN", "NOUN", "AUX", "PRON", "NOUN", "ADJ", "NOUN"],
|
||||||
|
["nmod:gobj", "nsubj:cop", "cop", "ROOT", "nmod:poss", "amod", "nmod"],
|
||||||
|
[1, 2, 1, 0, 2, 1, -3],
|
||||||
|
[
|
||||||
|
"Liikkuvuuden lisääminen",
|
||||||
|
"korkeakoulutuksen keskeisistä kehittämiskohteista",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Kaupalliset palvelut jätetään yksityisten palveluntarjoajien tarjottavaksi",
|
||||||
|
["ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
|
||||||
|
["amod", "obj", "ROOT", "amod", "nmod:gsubj", "obl"],
|
||||||
|
[1, 1, 0, 1, 1, -3],
|
||||||
|
["Kaupalliset palvelut", "yksityisten palveluntarjoajien tarjottavaksi"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"New York tunnetaan kaupunkina, joka ei koskaan nuku",
|
||||||
|
["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"],
|
||||||
|
[
|
||||||
|
"obj",
|
||||||
|
"flat:name",
|
||||||
|
"ROOT",
|
||||||
|
"obl",
|
||||||
|
"punct",
|
||||||
|
"nsubj",
|
||||||
|
"aux",
|
||||||
|
"advmod",
|
||||||
|
"acl:relcl",
|
||||||
|
],
|
||||||
|
[2, -1, 0, -1, 4, 3, 2, 1, -5],
|
||||||
|
["New York", "kaupunkina"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Loput vihjeet saat herra Möttöseltä",
|
||||||
|
["NOUN", "NOUN", "VERB", "NOUN", "PROPN"],
|
||||||
|
["compound:nn", "obj", "ROOT", "compound:nn", "obj"],
|
||||||
|
[1, 1, 0, 1, -2],
|
||||||
|
["Loput vihjeet", "herra Möttöseltä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"mahdollisuus tukea muita päivystysyksiköitä",
|
||||||
|
["NOUN", "VERB", "PRON", "NOUN"],
|
||||||
|
["ROOT", "acl", "det", "obj"],
|
||||||
|
[0, -1, 1, -2],
|
||||||
|
["mahdollisuus", "päivystysyksiköitä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"sairaanhoitopiirit harjoittavat leikkaustoimintaa alueellaan useammassa sairaalassa",
|
||||||
|
["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"],
|
||||||
|
["nsubj", "ROOT", "obj", "obl", "amod", "obl"],
|
||||||
|
[1, 0, -1, -1, 1, -3],
|
||||||
|
[
|
||||||
|
"sairaanhoitopiirit",
|
||||||
|
"leikkaustoimintaa",
|
||||||
|
"alueellaan",
|
||||||
|
"useammassa sairaalassa",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa",
|
||||||
|
["NOUN", "ADP", "NOUN", "AUX", "ADJ", "NOUN"],
|
||||||
|
["obl", "case", "nsubj:cop", "cop", "amod", "ROOT"],
|
||||||
|
[5, -1, 3, 2, 1, 0],
|
||||||
|
["Lain", "varhaiskasvatus", "suunnitelmallista toimintaa"],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_noun_chunks_is_parsed(fi_tokenizer):
    """Test that noun_chunks raises Value Error for 'fi' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = fi_tokenizer("Tämä on testi")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)


@pytest.mark.parametrize(
    "text,pos,deps,heads,expected_noun_chunks", FI_NP_TEST_EXAMPLES
)
def test_fi_noun_chunks(fi_tokenizer, text, pos, deps, heads, expected_noun_chunks):
    tokens = fi_tokenizer(text)

    assert len(heads) == len(pos)
    doc = Doc(
        tokens.vocab,
        words=[t.text for t in tokens],
        heads=[head + i for i, head in enumerate(heads)],
        deps=deps,
        pos=pos,
    )

    noun_chunks = list(doc.noun_chunks)
    assert len(noun_chunks) == len(expected_noun_chunks)
    for i, np in enumerate(noun_chunks):
        assert np.text == expected_noun_chunks[i]
@@ -1,8 +1,230 @@
+from spacy.tokens import Doc
 import pytest


+# fmt: off
+@pytest.mark.parametrize(
+    "words,heads,deps,pos,chunk_offsets",
+    [
|
# determiner + noun
|
||||||
|
# un nom -> un nom
|
||||||
|
(
|
||||||
|
["un", "nom"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# determiner + noun starting with vowel
|
||||||
|
# l'heure -> l'heure
|
||||||
|
(
|
||||||
|
["l'", "heure"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# determiner + plural noun
|
||||||
|
# les romans -> les romans
|
||||||
|
(
|
||||||
|
["les", "romans"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# det + adj + noun
|
||||||
|
# Le vieux Londres -> Le vieux Londres
|
||||||
|
(
|
||||||
|
['Les', 'vieux', 'Londres'],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "amod", "ROOT"],
|
||||||
|
["DET", "ADJ", "NOUN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# det + noun + adj
|
||||||
|
# le nom propre -> le nom propre a proper noun
|
||||||
|
(
|
||||||
|
["le", "nom", "propre"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0, 3)],
|
||||||
|
),
|
||||||
|
# det + noun + adj plural
|
||||||
|
# Les chiens bruns -> les chiens bruns
|
||||||
|
(
|
||||||
|
["Les", "chiens", "bruns"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0, 3)],
|
||||||
|
),
|
||||||
|
# multiple adjectives: one adj before the noun, one adj after the noun
|
||||||
|
# un nouveau film intéressant -> un nouveau film intéressant
|
||||||
|
(
|
||||||
|
["un", "nouveau", "film", "intéressant"],
|
||||||
|
[2, 2, 2, 2],
|
||||||
|
["det", "amod", "ROOT", "amod"],
|
||||||
|
["DET", "ADJ", "NOUN", "ADJ"],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# multiple adjectives, both adjs after the noun
|
||||||
|
# une personne intelligente et drôle -> une personne intelligente et drôle
|
||||||
|
(
|
||||||
|
["une", "personne", "intelligente", "et", "drôle"],
|
||||||
|
[1, 1, 1, 4, 2],
|
||||||
|
["det", "ROOT", "amod", "cc", "conj"],
|
||||||
|
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
||||||
|
[(0,5)]
|
||||||
|
),
|
||||||
|
# relative pronoun
|
||||||
|
# un bus qui va au ville -> un bus, qui, ville
|
||||||
|
(
|
||||||
|
['un', 'bus', 'qui', 'va', 'au', 'ville'],
|
||||||
|
[1, 1, 3, 1, 5, 3],
|
||||||
|
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
|
||||||
|
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
|
||||||
|
[(0,2), (2,3), (5,6)]
|
||||||
|
),
|
||||||
|
# relative subclause
|
||||||
|
# Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy.
|
||||||
|
(
|
||||||
|
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
|
||||||
|
[0, 2, 0, 5, 5, 2, 5],
|
||||||
|
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
|
||||||
|
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
|
||||||
|
[(1,3), (4,5)]
|
||||||
|
),
|
||||||
|
# Person name and title by flat
|
||||||
|
# Louis XIV -> Louis XIV
|
||||||
|
(
|
||||||
|
["Louis", "XIV"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "flat:name"],
|
||||||
|
["PROPN", "PROPN"],
|
||||||
|
[(0,2)]
|
||||||
|
),
|
||||||
|
# Organization name by flat
|
||||||
|
# Nations Unies -> Nations Unies
|
||||||
|
(
|
||||||
|
["Nations", "Unies"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "flat:name"],
|
||||||
|
["PROPN", "PROPN"],
|
||||||
|
[(0,2)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name created by two flats
|
||||||
|
# Louise de Bratagne -> Louise de Bratagne
|
||||||
|
(
|
||||||
|
["Louise", "de", "Bratagne"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name created by two flats
|
||||||
|
# Louis François Joseph -> Louis François Joseph
|
||||||
|
(
|
||||||
|
["Louis", "François", "Joseph"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# one determiner + one noun + one adjective qualified by an adverb
|
||||||
|
# quelques agriculteurs très riches -> quelques agriculteurs très riches
|
||||||
|
(
|
||||||
|
["quelques", "agriculteurs", "très", "riches"],
|
||||||
|
[1, 1, 3, 1],
|
||||||
|
['det', 'ROOT', 'advmod', 'amod'],
|
||||||
|
['DET', 'NOUN', 'ADV', 'ADJ'],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# Two NPs conjuncted
|
||||||
|
# Il a un chien et un chat -> Il, un chien, un chat
|
||||||
|
(
|
||||||
|
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
|
||||||
|
[1, 1, 3, 1, 6, 6, 3],
|
||||||
|
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
||||||
|
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(0,1), (2,4), (5,7)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Two NPs together
|
||||||
|
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
|
||||||
|
(
|
||||||
|
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
|
||||||
|
[1, 1, 1, 1, 3],
|
||||||
|
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
|
||||||
|
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
||||||
|
[(0, 3), (3, 5)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# la destruction de la ville -> la destruction, la ville
|
||||||
|
(
|
||||||
|
['la', 'destruction', 'de', 'la', 'ville'],
|
||||||
|
[1, 1, 4, 4, 1],
|
||||||
|
['det', 'ROOT', 'case', 'det', 'nmod'],
|
||||||
|
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
|
||||||
|
[(0,2), (3,5)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# Archiduchesse d’Autriche -> Archiduchesse, Autriche
|
||||||
|
(
|
||||||
|
['Archiduchesse', 'd’', 'Autriche'],
|
||||||
|
[0, 2, 0],
|
||||||
|
['ROOT', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3)]
|
||||||
|
),
|
||||||
|
# Compounding by nmod, several NPs chained together
|
||||||
|
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
|
||||||
|
(
|
||||||
|
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
|
||||||
|
[2, 2, 2, 4, 2, 6, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(0, 3), (4, 5), (6, 7)]
|
||||||
|
),
|
||||||
|
# several NPs
|
||||||
|
# Traduction du rapport de Susana -> Traduction, rapport, Susana
|
||||||
|
(
|
||||||
|
['Traduction', 'du', 'raport', 'de', 'Susana'],
|
||||||
|
[0, 2, 0, 4, 2],
|
||||||
|
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3), (4,5)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Several NPs
|
||||||
|
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
|
||||||
|
(
|
||||||
|
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
|
||||||
|
[2, 2, 2, 4, 2, 7, 7, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(0,3), (4,5), (6,8)]
|
||||||
|
),
|
||||||
|
# Passive subject
|
||||||
|
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
|
||||||
|
(
|
||||||
|
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
|
||||||
|
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
|
||||||
|
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
|
||||||
|
[(0, 3), (6, 10), (11, 12)]
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
|
||||||
|
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
||||||
|
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
doc = fr_tokenizer("Je suis allé à l'école")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
spacy/tests/lang/hsb/__init__.py (new, empty file)

spacy/tests/lang/hsb/test_text.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jedne", True),
        ("dwanaće", True),
        ("milion", True),
        ("sto", True),
        ("załožene", False),
        ("wona", False),
        ("powšitkownej", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(hsb_tokenizer, text, match):
    tokens = hsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match

spacy/tests/lang/hsb/test_tokenizer.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import pytest

HSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.",
        [
            "Hornjoserbšćina",
            "wobsteji",
            "resp.",
            "wobsteješe",
            "z",
            "wjacorych",
            "dialektow",
            ",",
            "kotrež",
            "so",
            "zdźěla",
            "chětro",
            "wot",
            "so",
            "rozeznawachu",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
    tokens = hsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
spacy/tests/lang/it/test_noun_chunks.py (new file, 221 lines)
@@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest


# fmt: off
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
|
# determiner + noun
|
||||||
|
# un pollo -> un pollo
|
||||||
|
(
|
||||||
|
["un", "pollo"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0,2)],
|
||||||
|
),
|
||||||
|
# two determiners + noun
|
||||||
|
# il mio cane -> il mio cane
|
||||||
|
(
|
||||||
|
["il", "mio", "cane"],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "det:poss", "ROOT"],
|
||||||
|
["DET", "DET", "NOUN"],
|
||||||
|
[(0,3)],
|
||||||
|
),
|
||||||
|
# two determiners, one is after noun. rare usage but still testing
|
||||||
|
# il cane mio-> il cane mio
|
||||||
|
(
|
||||||
|
["il", "cane", "mio"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "det:poss"],
|
||||||
|
["DET", "NOUN", "DET"],
|
||||||
|
[(0,3)],
|
||||||
|
),
|
||||||
|
# relative pronoun
|
||||||
|
# È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty.
|
||||||
|
(
|
||||||
|
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
|
||||||
|
[2, 2, 2, 4, 2, 7, 7, 4],
|
||||||
|
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
|
||||||
|
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
||||||
|
[(3,5), (5,6)]
|
||||||
|
),
|
||||||
|
# relative subclause
|
||||||
|
# il computer che hai comprato -> il computer, che the computer that you bought
|
||||||
|
(
|
||||||
|
['il', 'computer', 'che', 'hai', 'comprato'],
|
||||||
|
[1, 1, 4, 4, 1],
|
||||||
|
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
|
||||||
|
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
||||||
|
[(0,2), (2,3)]
|
||||||
|
),
|
||||||
|
# det + noun + adj
|
||||||
|
# Una macchina grande -> Una macchina grande
|
||||||
|
(
|
||||||
|
["Una", "macchina", "grande"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0,3)],
|
||||||
|
),
|
||||||
|
# noun + adj plural
|
||||||
|
# mucche bianche
|
||||||
|
(
|
||||||
|
["mucche", "bianche"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "amod"],
|
||||||
|
["NOUN", "ADJ"],
|
||||||
|
[(0,2)],
|
||||||
|
),
|
||||||
|
# det + adj + noun
|
||||||
|
# Una grande macchina -> Una grande macchina
|
||||||
|
(
|
||||||
|
['Una', 'grande', 'macchina'],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "amod", "ROOT"],
|
||||||
|
["DET", "ADJ", "NOUN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# det + adj + noun, det with apostrophe
|
||||||
|
# un'importante associazione -> un'importante associazione
|
||||||
|
(
|
||||||
|
["Un'", 'importante', 'associazione'],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "amod", "ROOT"],
|
||||||
|
["DET", "ADJ", "NOUN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# multiple adjectives
|
||||||
|
# Un cane piccolo e marrone -> Un cane piccolo e marrone
|
||||||
|
(
|
||||||
|
["Un", "cane", "piccolo", "e", "marrone"],
|
||||||
|
[1, 1, 1, 4, 2],
|
||||||
|
["det", "ROOT", "amod", "cc", "conj"],
|
||||||
|
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
||||||
|
[(0,5)]
|
||||||
|
),
|
||||||
|
# determiner, adjective, compound created by flat
|
||||||
|
# le Nazioni Unite -> le Nazioni Unite
|
||||||
|
(
|
||||||
|
["le", "Nazioni", "Unite"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "flat:name"],
|
||||||
|
["DET", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# one determiner + one noun + one adjective qualified by an adverb
|
||||||
|
# alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers
|
||||||
|
(
|
||||||
|
['alcuni', 'contadini', 'molto', 'ricchi'],
|
||||||
|
[1, 1, 3, 1],
|
||||||
|
['det', 'ROOT', 'advmod', 'amod'],
|
||||||
|
['DET', 'NOUN', 'ADV', 'ADJ'],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# Two NPs conjuncted
|
||||||
|
# Ho un cane e un gatto -> un cane, un gatto
|
||||||
|
(
|
||||||
|
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
|
||||||
|
[0, 2, 0, 5, 5, 0],
|
||||||
|
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
||||||
|
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(1,3), (4,6)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Two NPs together
|
||||||
|
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
|
||||||
|
(
|
||||||
|
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
|
||||||
|
[1, 1, 1, 1, 3],
|
||||||
|
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
|
||||||
|
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
||||||
|
[(0, 3), (3, 5)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name and titles
|
||||||
|
# Dom Pedro II -> Dom Pedro II
|
||||||
|
(
|
||||||
|
["Dom", "Pedro", "II"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# Noun compound created by flat
|
||||||
|
# gli Stati Uniti
|
||||||
|
(
|
||||||
|
["gli", "Stati", "Uniti"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "flat:name"],
|
||||||
|
["DET", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# la distruzione della città -> la distruzione, città
|
||||||
|
(
|
||||||
|
['la', 'distruzione', 'della', 'città'],
|
||||||
|
[1, 1, 3, 1],
|
||||||
|
['det', 'ROOT', 'case', 'nmod'],
|
||||||
|
['DET', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(0,2), (3,4)]
|
||||||
|
),
|
||||||
|
# Compounding by nmod, several NPs chained together
|
||||||
|
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
|
||||||
|
(
|
||||||
|
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
|
||||||
|
[2, 2, 2, 4, 2, 6, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(0, 3), (4, 5), (6, 7)]
|
||||||
|
),
|
||||||
|
# several NPs
|
||||||
|
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
|
||||||
|
(
|
||||||
|
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
|
||||||
|
[0, 2, 0, 4, 2],
|
||||||
|
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3), (4,5)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Several NPs
|
||||||
|
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
|
||||||
|
(
|
||||||
|
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
|
||||||
|
[1, 1, 1, 4, 1, 8, 8, 8, 1],
|
||||||
|
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
|
||||||
|
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
|
||||||
|
[(0,3), (4,5), (6,9)]
|
||||||
|
),
|
||||||
|
# Passive subject
|
||||||
|
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton
|
||||||
|
(
|
||||||
|
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
|
||||||
|
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
|
||||||
|
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0, 3), (6, 8), (9, 10), (11,12)]
|
||||||
|
),
|
||||||
|
# Misc
|
||||||
|
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti
|
||||||
|
(
|
||||||
|
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
|
||||||
|
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
|
||||||
|
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
|
||||||
|
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(2,4), (9,12), (13,14), (17,18), (19,20)]
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
    doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets


def test_noun_chunks_is_parsed_it(it_tokenizer):
    """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
    doc = it_tokenizer("Sei andato a Oxford")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
spacy/tests/lang/it/test_stopwords.py (new file, 17 lines)
@@ -0,0 +1,17 @@
import pytest


@pytest.mark.parametrize(
    "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"]
)
def test_stopwords_basic(it_tokenizer, word):
    tok = it_tokenizer(word)[0]
    assert tok.is_stop


@pytest.mark.parametrize(
    "word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"]
)
def test_stopwords_elided(it_tokenizer, word):
    tok = it_tokenizer(word)[0]
    assert tok.is_stop
Some files were not shown because too many files have changed in this diff.