Merge branch 'master' into spacy.io

Adriane Boyd 2022-02-11 11:26:08 +01:00
commit d43082289c
64 changed files with 877 additions and 202 deletions

.github/workflows/gputests.yml (new file)

@ -0,0 +1,21 @@
name: Weekly GPU tests
on:
schedule:
- cron: '0 1 * * MON'
jobs:
weekly-gputests:
strategy:
fail-fast: false
matrix:
branch: [master, develop, v4]
runs-on: ubuntu-latest
steps:
- name: Trigger buildkite build
uses: buildkite/trigger-pipeline-action@v1.2.0
env:
PIPELINE: explosion-ai/spacy-slow-gpu-tests
BRANCH: ${{ matrix.branch }}
MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}

.github/workflows/slowtests.yml (new file)

@ -0,0 +1,35 @@
name: Daily slow tests
on:
schedule:
- cron: '0 0 * * *'
jobs:
daily-slowtests:
strategy:
fail-fast: false
matrix:
branch: [master, develop, v4]
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Get commits from past 24 hours
id: check_commits
run: |
today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then
echo "::set-output name=run_tests::true"
else
echo "::set-output name=run_tests::false"
fi
- name: Trigger buildkite build
if: steps.check_commits.outputs.run_tests == 'true'
uses: buildkite/trigger-pipeline-action@v1.2.0
env:
PIPELINE: explosion-ai/spacy-slow-tests
BRANCH: ${{ matrix.branch }}
MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
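
As a rough illustration of the commit check above, here is the same 24-hour test as a standalone Python sketch (the helper name and the use of `subprocess` are my own; the workflow itself runs the shell step shown):

```python
import subprocess
from datetime import datetime, timedelta

def has_recent_commits(hours: int = 24) -> bool:
    # Mirror the workflow's `git log --after=... --before=...` check:
    # any commit in the window makes the slow tests run.
    after = (datetime.now() - timedelta(hours=hours)).strftime("%Y-%m-%d %H:%M:%S")
    log = subprocess.run(
        ["git", "log", f"--after={after}"],
        capture_output=True, text=True, check=True,
    ).stdout
    return "commit" in log

if __name__ == "__main__":
    # The workflow exports this as the `run_tests` step output.
    print(f"::set-output name=run_tests::{str(has_recent_commits()).lower()}")
```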


@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal


@ -1,11 +1,8 @@
recursive-include include *.h
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml
include spacy/py.typed
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include spacy/cli *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp


@ -31,7 +31,8 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.910
mypy==0.910
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-requests
black>=22.0,<23.0


@ -77,37 +77,39 @@ transformers =
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<10.0.0
cupy>=5.0.0b4,<11.0.0
cuda80 =
cupy-cuda80>=5.0.0b4,<10.0.0
cupy-cuda80>=5.0.0b4,<11.0.0
cuda90 =
cupy-cuda90>=5.0.0b4,<10.0.0
cupy-cuda90>=5.0.0b4,<11.0.0
cuda91 =
cupy-cuda91>=5.0.0b4,<10.0.0
cupy-cuda91>=5.0.0b4,<11.0.0
cuda92 =
cupy-cuda92>=5.0.0b4,<10.0.0
cupy-cuda92>=5.0.0b4,<11.0.0
cuda100 =
cupy-cuda100>=5.0.0b4,<10.0.0
cupy-cuda100>=5.0.0b4,<11.0.0
cuda101 =
cupy-cuda101>=5.0.0b4,<10.0.0
cupy-cuda101>=5.0.0b4,<11.0.0
cuda102 =
cupy-cuda102>=5.0.0b4,<10.0.0
cupy-cuda102>=5.0.0b4,<11.0.0
cuda110 =
cupy-cuda110>=5.0.0b4,<10.0.0
cupy-cuda110>=5.0.0b4,<11.0.0
cuda111 =
cupy-cuda111>=5.0.0b4,<10.0.0
cupy-cuda111>=5.0.0b4,<11.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<10.0.0
cupy-cuda112>=5.0.0b4,<11.0.0
cuda113 =
cupy-cuda113>=5.0.0b4,<10.0.0
cupy-cuda113>=5.0.0b4,<11.0.0
cuda114 =
cupy-cuda114>=5.0.0b4,<10.0.0
cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.4.9
sudachidict_core>=20200330
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
th =


@ -1,3 +1,6 @@
from .errors import Errors
IOB_STRINGS = ("", "I", "O", "B")
IDS = {
"": NULL_ATTR,
@ -64,7 +67,6 @@ IDS = {
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
@ -72,7 +74,6 @@ IDS = {
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
@ -87,7 +88,7 @@ IDS = {
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX
"IDX": IDX,
}
@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
inty_attrs = {}
if _do_deprecated:
if 'F' in stringy_attrs:
if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if 'L' in stringy_attrs:
if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if 'pos' in stringy_attrs:
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if 'morph' in stringy_attrs:
morphs = stringy_attrs.pop('morph')
if 'number' in stringy_attrs:
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph")
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
stringy_attrs.pop("tenspect")
morph_keys = [
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
'NumValue', 'PartType', 'Polite', 'StyleVariant',
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', 'PrepCase', 'Animacy' # U20
"PunctType",
"PunctSide",
"Other",
"Degree",
"AdvType",
"Number",
"VerbForm",
"PronType",
"Aspect",
"Tense",
"PartType",
"Poss",
"Hyph",
"ConjType",
"NumType",
"Foreign",
"VerbType",
"NounType",
"Gender",
"Mood",
"Negative",
"Tense",
"Voice",
"Abbr",
"Derivation",
"Echo",
"Foreign",
"NameType",
"NounType",
"NumForm",
"NumValue",
"PartType",
"Polite",
"StyleVariant",
"PronType",
"AdjType",
"Person",
"Variant",
"AdpType",
"Reflex",
"Negative",
"Mood",
"Aspect",
"Case",
"Polarity",
"PrepCase",
"Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if int_key == ENT_IOB:
if value in IOB_STRINGS:
value = IOB_STRINGS.index(value)
elif isinstance(value, str):
raise ValueError(Errors.E1025.format(value=value))
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
if hasattr(strings_map, "add"):
value = strings_map.add(value)
else:
value = strings_map[value]
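
For reference, the new `ENT_IOB` handling converts IOB strings to their integer codes and rejects anything else with the new E1025 error; a minimal usage sketch mirroring the tests added further down:

```python
from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings map to their index in IOB_STRINGS = ("", "I", "O", "B")
assert intify_attrs({"ENT_IOB": ""}) == {ENT_IOB: 0}
assert intify_attrs({"ENT_IOB": "I"}) == {ENT_IOB: 1}
assert intify_attrs({"ENT_IOB": "O"}) == {ENT_IOB: 2}
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}

# Any other string for ENT_IOB raises E1025
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError as err:
    print(err)
```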


@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer
from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
@ -193,6 +193,70 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
if "spancat" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
msg.divider("Span Categorization")
msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)
msg.text("Label counts in train data: ", show=verbose)
for spans_key, data_labels in gold_train_data["spancat"].items():
msg.text(
f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
show=verbose,
)
# Data checks: only take the spans keys in the actual spancat components
data_labels_in_component = {
spans_key: gold_train_data["spancat"][spans_key]
for spans_key in model_labels_spancat.keys()
}
for spans_key, data_labels in data_labels_in_component.items():
for label, count in data_labels.items():
# Check for missing labels
spans_key_in_model = spans_key in model_labels_spancat.keys()
if (spans_key_in_model) and (
label not in model_labels_spancat[spans_key]
):
msg.warn(
f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
"Performance may degrade after training."
)
# Check for low number of examples per label
if count <= NEW_LABEL_THRESHOLD:
msg.warn(
f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
)
has_low_data_warning = True
# Check for negative examples
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(
train_dataset, label, "spancat", spans_key
)
if neg_docs == 0:
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if has_low_data_warning:
msg.text(
f"To train a new span type, your data should include at "
f"least {NEW_LABEL_THRESHOLD} instances of the new label",
show=verbose,
)
else:
msg.good("Good amount of examples for all labels")
if has_no_neg_warning:
msg.text(
"Training data should always include examples of spans "
"in context, as well as examples without a given span "
"type.",
show=verbose,
)
else:
msg.good("Examples without ocurrences available for all labels")
if "ner" in factory_names:
# Get all unique NER labels present in the data
labels = set(
@ -203,6 +267,7 @@ def debug_data(
has_low_data_warning = False
has_no_neg_warning = False
has_ws_ents_error = False
has_boundary_cross_ents_warning = False
msg.divider("Named Entity Recognition")
msg.info(f"{len(model_labels)} label(s)")
@ -237,17 +302,25 @@ def debug_data(
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(train_dataset, label)
neg_docs = _get_examples_without_label(train_dataset, label, "ner")
if neg_docs == 0:
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if gold_train_data["boundary_cross_ents"]:
msg.warn(
f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
)
has_boundary_cross_ents_warning = True
if not has_low_data_warning:
msg.good("Good amount of examples for all labels")
if not has_no_neg_warning:
msg.good("Examples without occurrences available for all labels")
if not has_ws_ents_error:
msg.good("No entities consisting of or starting/ending with whitespace")
if not has_boundary_cross_ents_warning:
msg.good("No entities crossing sentence boundaries")
if has_low_data_warning:
msg.text(
@ -564,7 +637,9 @@ def _compile_gold(
"deps": Counter(),
"words": Counter(),
"roots": Counter(),
"spancat": dict(),
"ws_ents": 0,
"boundary_cross_ents": 0,
"n_words": 0,
"n_misaligned_words": 0,
"words_missing_vectors": Counter(),
@ -593,6 +668,7 @@ def _compile_gold(
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
data["words_missing_vectors"].update([word])
if "ner" in factory_names:
sent_starts = eg.get_aligned_sent_starts()
for i, label in enumerate(eg.get_aligned_ner()):
if label is None:
continue
@ -602,8 +678,19 @@ def _compile_gold(
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
if sent_starts[i] == True and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "spancat" in factory_names:
for span_key in list(eg.reference.spans.keys()):
if span_key not in data["spancat"]:
data["spancat"][span_key] = Counter()
for i, span in enumerate(eg.reference.spans[span_key]):
if span.label_ is None:
continue
else:
data["spancat"][span_key][span.label_] += 1
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
data["cats"].update(gold.cats)
if any(val not in (0, 1) for val in gold.cats.values()):
@ -674,21 +761,57 @@ def _format_labels(
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
def _get_examples_without_label(
data: Sequence[Example],
label: str,
component: Literal["ner", "spancat"] = "ner",
spans_key: Optional[str] = "sc",
) -> int:
count = 0
for eg in data:
if component == "ner":
labels = [
label.split("-")[1]
for label in eg.get_aligned_ner()
if label not in ("O", "-", None)
]
if component == "spancat":
labels = (
[span.label_ for span in eg.reference.spans[spans_key]]
if spans_key in eg.reference.spans
else []
)
if label not in labels:
count += 1
return count
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
if pipe_name not in nlp.pipe_names:
return set()
def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == factory_name
]
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
return set(pipe.labels)
labels.update(pipe.labels)
return labels
def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, SpanCategorizer)
if pipe.key not in labels:
labels[pipe.key] = set()
labels[pipe.key].update(pipe.labels)
return labels
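
A short sketch of the new factory-based label lookup, based on the test added in `test_cli.py` below (the pipe name `my_spancat` is just an example):

```python
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.lang.en import English

nlp = English()
spancat = nlp.add_pipe("spancat", name="my_spancat")
for label in ("A", "B"):
    spancat.add_label(label)
nlp.initialize()

# Labels are looked up by factory name (so renamed components are found)
# and grouped by each component's spans key (default "sc").
assert _get_labels_from_spancat(nlp) == {spancat.key: {"A", "B"}}
```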


@ -7,6 +7,7 @@ from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema
@ -109,6 +110,24 @@ def package(
", ".join(meta["requirements"]),
)
if name is not None:
if not name.isidentifier():
msg.fail(
f"Model name ('{name}') is not a valid module name. "
"This is required so it can be imported as a module.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
exits=1,
)
if not _is_permitted_package_name(name):
msg.fail(
f"Model name ('{name}') is not a permitted package name. "
"This is required to correctly load the model with spacy.load.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
exits=1,
)
meta["name"] = name
if version is not None:
meta["version"] = version
@ -162,7 +181,7 @@ def package(
imports="\n".join(f"from . import {m}" for m in imports)
)
create_file(package_path / "__init__.py", init_py)
msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
if create_sdist:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
@ -171,8 +190,14 @@ def package(
if create_wheel:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
wheel_name_squashed = re.sub("_+", "_", model_name_v)
wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
msg.good(f"Successfully created binary wheel", wheel)
if "__" in model_name:
msg.warn(
f"Model name ('{model_name}') contains a run of underscores. "
"Runs of underscores are not significant in installed package names.",
)
def has_wheel() -> bool:
@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
return md.text
def _is_permitted_package_name(package_name: str) -> bool:
# regex from: https://www.python.org/dev/peps/pep-0426/#name
permitted_match = re.search(
r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
)
return permitted_match is not None
TEMPLATE_SETUP = """
#!/usr/bin/env python
import io
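
A few illustrative cases for the new name check (the first name is only an example; the rejected ones are taken from the tests added below):

```python
from spacy.cli.package import _is_permitted_package_name

assert _is_permitted_package_name("en_core_web_sm")   # plain ASCII identifier-style name
assert not _is_permitted_package_name("Meine_Bäume")  # non-ASCII characters
assert not _is_permitted_package_name("_package")     # leading separator
assert not _is_permitted_package_name("package-")     # trailing separator
```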


@ -1,6 +1,7 @@
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
import requests
@ -129,10 +130,17 @@ def fetch_asset(
the asset failed.
"""
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
if dest_path.exists():
# If there's already a file, check for checksum
if checksum:
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return
else:
# If there's not a checksum, make sure the file is a possibly valid size
if os.path.getsize(dest_path) == 0:
msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
os.remove(dest_path)
# We might as well support the user here and create parent directories in
# case the asset dir isn't listed as a dir to create in the project.yml
if not dest_path.parent.exists():


@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
[paths]
train = null
dev = null
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
[system]
{% if use_transformer -%}
@ -421,8 +426,4 @@ compound = 1.001
{% endif %}
[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}


@ -68,12 +68,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
# Controls early-stopping. 0 disables early stopping.
# Controls early-stopping, i.e., the number of steps to continue without
# improvement before stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
# memory and shuffled within the training loop. -1 means stream train corpus
# rather than loading in memory with no shuffling within the training loop.
max_epochs = 0
# Maximum number of update steps to train for. 0 means an unlimited number of steps.
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.


@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
"LOC": "#ff9561",
"PERSON": "#aa9cfc",
"NORP": "#c887fb",
"FACILITY": "#9cc9cc",
"FAC": "#9cc9cc",
"EVENT": "#ffeb80",
"LAW": "#ff8197",
"LANGUAGE": "#ff8197",


@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes):
"components, since spans are only views of the Doc. Use Doc and "
"Token attributes (or custom extension attributes) only and remove "
"the following: {attrs}")
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
"Only Doc and Token attributes are supported.")
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
"to define the attribute? For example: `{attr}.???`")
@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
"patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
"supported values are: 'I', 'O', 'B' and ''")
# Deprecated model shortcuts, only used in errors and warnings


@ -310,7 +310,6 @@ GLOSSARY = {
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
"sb": "subject",
"sbp": "passivized subject (PP)",
"sp": "subject or predicate",
"svp": "separable verb prefix",


@ -90,7 +90,7 @@ _eleven_to_beyond = [
"अड़सठ",
"उनहत्तर",
"सत्तर",
"इकहत्तर"
"इकहत्तर",
"बहत्तर",
"तिहत्तर",
"चौहत्तर",


@ -59,7 +59,7 @@ sentences = [
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
"São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
"Londres é a maior cidade do Reino Unido.",
# Translations from English:


@ -131,7 +131,7 @@ class Language:
self,
vocab: Union[Vocab, bool] = True,
*,
max_length: int = 10 ** 6,
max_length: int = 10**6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
batch_size: int = 1000,
@ -354,12 +354,15 @@ class Language:
@property
def pipe_labels(self) -> Dict[str, List[str]]:
"""Get the labels set by the pipeline components, if available (if
the component exposes a labels property).
the component exposes a labels property and the labels are not
hidden).
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self._components:
if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
continue
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return SimpleFrozenDict(labels)
@ -522,7 +525,7 @@ class Language:
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional["Pipe"] = None,
) -> Callable:
) -> Callable[..., Any]:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the
@ -1285,9 +1288,9 @@ class Language:
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
if self.vocab.vectors.data.shape[1] >= 1:
if self.vocab.vectors.shape[1] >= 1:
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
self.vocab.vectors.to_ops(ops)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
self.tokenizer.initialize, # type: ignore[union-attr]
@ -1332,8 +1335,8 @@ class Language:
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if self.vocab.vectors.shape[1] >= 1:
self.vocab.vectors.to_ops(ops)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]


@ -19,7 +19,7 @@ class Lexeme:
@property
def vector_norm(self) -> float: ...
vector: Floats1d
rank: str
rank: int
sentiment: float
@property
def orth_(self) -> str: ...


@ -130,7 +130,9 @@ cdef class Lexeme:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
@property
def has_vector(self):
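
With `.item()`, `Lexeme.similarity` now returns a plain Python float on both numpy and cupy backends (matching the `isinstance(..., float)` assertions added to the vector tests below); a tiny sketch with made-up vectors:

```python
import numpy
from spacy.vocab import Vocab

vocab = Vocab()
vocab.set_vector("apple", numpy.asarray([1.0, 2.0, 3.0], dtype="f"))
vocab.set_vector("orange", numpy.asarray([1.0, 2.0, 2.0], dtype="f"))

sim = vocab["apple"].similarity(vocab["orange"])
assert isinstance(sim, float)  # a scalar, not a 0-d array
```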


@ -0,0 +1,66 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens.doc import Doc
from ..tokens.span import Span
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
_patterns: Dict[str, List[Any]]
_raw_patterns: Dict[str, List[Any]]
_tokens_to_key: Dict[str, List[Any]]
_root: Dict[str, List[Any]]
_tree: Dict[str, List[Any]]
_callbacks: Dict[
Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
]
_ops: Dict[str, Any]
vocab: Vocab
_matcher: Matcher
def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
def __reduce__(
self,
) -> Tuple[
Callable[
[Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
],
Tuple[
Vocab,
Dict[str, List[Any]],
Dict[
str,
Callable[
[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
],
],
],
None,
None,
]: ...
def __len__(self) -> int: ...
def __contains__(self, key: Union[str, int]) -> bool: ...
def add(
self,
key: Union[str, int],
patterns: List[List[Dict[str, Any]]],
*,
on_match: Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
] = ...
) -> None: ...
def has_key(self, key: Union[str, int]) -> bool: ...
def get(
self, key: Union[str, int], default: Optional[Any] = ...
) -> Tuple[
Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
],
List[List[Dict[str, Any]]],
]: ...
def remove(self, key: Union[str, int]) -> None: ...
def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
def unpickle_matcher(
vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
) -> DependencyMatcher: ...


@ -1,4 +1,6 @@
from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
from typing import Any, List, Dict, Tuple, Optional, Callable, Union
from typing import Iterator, Iterable, overload
from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span
@ -31,12 +33,22 @@ class Matcher:
) -> Union[
Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
as_spans: Literal[False] = ...,
allow_missing: bool = ...,
with_alignments: bool = ...
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
allow_missing: bool = ...,
with_alignments: bool = ...
) -> List[Span]: ...
def _normalize_key(self, key: Any) -> Any: ...


@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
attr = "SENT_START"
attr = IDS.get(attr)
if isinstance(value, str):
if attr == ENT_IOB and value in Token.iob_strings():
value = Token.iob_strings().index(value)
else:
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
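
In practice this lets token patterns match on the IOB part of the entity tag; a condensed version of the matcher test added further down:

```python
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = English()
doc = nlp("I visited New York")
doc.ents = [Span(doc, 2, 4, label="GPE")]

matcher = Matcher(nlp.vocab)
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])
# "New" begins the entity (B); only "York" is inside it (I).
assert [doc[s:e].text for _, s, e in matcher(doc)] == ["York"]
```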


@ -1,6 +1,6 @@
from typing import List, Tuple, Union, Optional, Callable, Any, Dict
from . import Matcher
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
from ..compat import Literal
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span
@ -14,16 +14,24 @@ class PhraseMatcher:
def add(
self,
key: str,
docs: List[List[Dict[str, Any]]],
docs: List[Doc],
*,
on_match: Optional[
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
] = ...,
) -> None: ...
def remove(self, key: str) -> None: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
as_spans: Literal[False] = ...,
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
) -> List[Span]: ...


@ -23,7 +23,7 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
if vocab.vectors.data.shape[1] == 0:
if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
loss = (diff ** 2).sum()
loss = (diff**2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
@ -116,7 +116,7 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
nO = vocab.vectors.data.shape[1]
nO = vocab.vectors.shape[1]
output_layer = chain(
cast(Model[List["Floats2d"], Floats2d], list2array()),
Maxout(


@ -94,7 +94,7 @@ def init(
nM = model.get_dim("nM") if model.has_dim("nM") else None
nO = model.get_dim("nO") if model.has_dim("nO") else None
if X is not None and len(X):
nM = X[0].vocab.vectors.data.shape[1]
nM = X[0].vocab.vectors.shape[1]
if Y is not None:
nO = Y.data.shape[1]


@ -1,3 +1,4 @@
from cython.operator cimport dereference as deref, preincrement as incr
from libc.string cimport memcpy, memset
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint32_t, uint64_t
@ -184,16 +185,20 @@ cdef cppclass StateC:
int L(int head, int idx) nogil const:
if idx < 1 or this._left_arcs.size() == 0:
return -1
cdef vector[int] lefts
for i in range(this._left_arcs.size()):
arc = this._left_arcs.at(i)
# Work backwards through left-arcs to find the arc at the
# requested index more quickly.
cdef size_t child_index = 0
it = this._left_arcs.const_rbegin()
while it != this._left_arcs.rend():
arc = deref(it)
if arc.head == head and arc.child != -1 and arc.child < head:
lefts.push_back(arc.child)
idx = (<int>lefts.size()) - idx
if idx < 0:
child_index += 1
if child_index == idx:
return arc.child
incr(it)
return -1
else:
return lefts.at(idx)
int R(int head, int idx) nogil const:
if idx < 1 or this._right_arcs.size() == 0:


@ -604,7 +604,7 @@ cdef class ArcEager(TransitionSystem):
actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):
for label, freq in label_freqs.copy().items():
if freq < min_freq:
label_freqs.pop(label)
# Ensure these actions are present


@ -26,6 +26,8 @@ class Pipe:
@property
def labels(self) -> Tuple[str, ...]: ...
@property
def hide_labels(self) -> bool: ...
@property
def label_data(self) -> Any: ...
def _require_labels(self) -> None: ...
def set_error_handler(


@ -102,6 +102,10 @@ cdef class Pipe:
def labels(self) -> Tuple[str, ...]:
return tuple()
@property
def hide_labels(self) -> bool:
return False
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate


@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
# are 0
return tuple(["I", "S"])
@property
def hide_labels(self):
return True
@property
def label_data(self):
return None
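
The practical effect, also asserted in the senter tests below: the component keeps reporting its internal labels, but they no longer leak into `Language.pipe_labels`.

```python
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("senter")

assert nlp.get_pipe("senter").labels == ("I", "S")  # still exposed on the pipe
assert "senter" not in nlp.pipe_labels              # hidden from pipe_labels
```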


@ -377,7 +377,7 @@ class SpanCategorizer(TrainablePipe):
# If the prediction is 0.9 and it's false, the gradient will be
# 0.9 (0.9 - 0.0)
d_scores = scores - target
loss = float((d_scores ** 2).sum())
loss = float((d_scores**2).sum())
return loss, d_scores
def initialize(
@ -412,7 +412,7 @@ class SpanCategorizer(TrainablePipe):
self._require_labels()
if subbatch:
docs = [eg.x for eg in subbatch]
spans = self.suggester(docs)
spans = build_ngram_suggester(sizes=[1])(docs)
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
self.model.initialize(X=(docs, spans), Y=Y)
else:


@ -281,7 +281,7 @@ class TextCategorizer(TrainablePipe):
bp_scores(gradient)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += (gradient ** 2).sum()
losses[self.name] += (gradient**2).sum()
return losses
def _examples_to_truth(
@ -315,7 +315,7 @@ class TextCategorizer(TrainablePipe):
not_missing = self.model.ops.asarray(not_missing) # type: ignore
d_scores = (scores - truths) / scores.shape[0]
d_scores *= not_missing
mean_square_error = (d_scores ** 2).sum(axis=1).mean()
mean_square_error = (d_scores**2).sum(axis=1).mean()
return float(mean_square_error), d_scores
def add_label(self, label: str) -> int:


@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
class TokenPattern(BaseModel):
@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
ent_iob: Optional[IobValue] = None
ent_id: Optional[StringValue] = None
ent_kb_id: Optional[StringValue] = None
norm: Optional[StringValue] = None


@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
"Merging the docs is fun.",
"",
"They don't think alike. ",
"",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
en_docs[3].spans["group"] = [en_docs[3][0:1]]
en_docs[4].spans["group"] = [en_docs[4][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)
@ -683,6 +684,7 @@ def test_has_annotation(en_vocab):
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
for attr in attrs:
assert not doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
doc[0].tag_ = "A"
doc[0].pos_ = "X"
@ -708,6 +710,27 @@ def test_has_annotation(en_vocab):
assert doc.has_annotation(attr, require_complete=True)
def test_has_annotation_sents(en_vocab):
doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
for attr in attrs:
assert not doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
# The first token (index 0) is always assumed to be a sentence start,
# and ignored by the check in doc.has_annotation
doc[1].is_sent_start = False
for attr in attrs:
assert doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
doc[2].is_sent_start = False
for attr in attrs:
assert doc.has_annotation(attr)
assert doc.has_annotation(attr, require_complete=True)
def test_is_flags_deprecated(en_tokenizer):
doc = en_tokenizer("test")
with pytest.deprecated_call():


@ -1,4 +1,5 @@
import pytest
from spacy.attrs import intify_attrs, ENT_IOB
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
from spacy.lang.en.stop_words import STOP_WORDS
@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
def test_attrs_ent_iob_intify():
int_attrs = intify_attrs({"ENT_IOB": ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({"ENT_IOB": "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({"ENT_IOB": "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({"ENT_IOB": "B"})
assert int_attrs == {ENT_IOB: 3}
int_attrs = intify_attrs({ENT_IOB: ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({ENT_IOB: "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({ENT_IOB: "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({ENT_IOB: "B"})
assert int_attrs == {ENT_IOB: 3}
with pytest.raises(ValueError):
int_attrs = intify_attrs({"ENT_IOB": "XX"})
with pytest.raises(ValueError):
int_attrs = intify_attrs({ENT_IOB: "XX"})
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match):
assert is_punct(text) == match


@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
assert len(matcher(doc)) == 0
def test_matcher_ent_iob_key(en_vocab):
"""Test that patterns with ent_iob works correctly."""
matcher = Matcher(en_vocab)
matcher.add("Rule", [[{"ENT_IOB": "I"}]])
doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
assert len(matches1) == 1
assert matches1[0] == "York"
assert len(matches2) == 0
matcher = Matcher(en_vocab) # Test iob pattern with operators
matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
doc = Doc(
en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
)
doc.ents = [Span(doc, 4, 7, label="PERSON")]
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 3
assert matches[0] == "Maria"
assert matches[1] == "Maria Esperanza"
assert matches[2] == "Esperanza"


@ -12,6 +12,7 @@ TEST_PATTERNS = [
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
([{"_": "foo"}], 1, 1),
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)


@ -12,6 +12,7 @@ def test_build_dependencies():
"flake8",
"hypothesis",
"pre-commit",
"black",
"mypy",
"types-dataclasses",
"types-mock",


@ -97,3 +97,7 @@ def test_overfitting_IO():
]
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
# test internal pipe labels vs. Language.pipe_labels with hidden labels
assert nlp.get_pipe("senter").labels == ("I", "S")
assert "senter" not in nlp.pipe_labels


@ -80,6 +80,8 @@ def test_explicit_labels():
assert spancat.labels == ("PERSON", "LOC")
# TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
# If the Doc object is garbage collected, the spans won't be functional afterwards
nlp = Language()
@ -97,6 +99,7 @@ def test_doc_gc():
assert isinstance(spangroups, SpanGroups)
for key, spangroup in spangroups.items():
assert isinstance(spangroup, SpanGroup)
# XXX This fails with length 0 sometimes
assert len(spangroup) > 0
with pytest.raises(RuntimeError):
span = spangroup[0]


@ -12,14 +12,18 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import Language
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.tokens import Doc
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
from spacy.training.converters import iob_to_docs
@ -665,3 +669,54 @@ def test_get_third_party_dependencies():
)
def test_is_subpath_of(parent, child, expected):
assert is_subpath_of(parent, child) == expected
@pytest.mark.slow
@pytest.mark.parametrize(
"factory_name,pipe_name",
[
("ner", "ner"),
("ner", "my_ner"),
("spancat", "spancat"),
("spancat", "my_spancat"),
],
)
def test_get_labels_from_model(factory_name, pipe_name):
labels = ("A", "B")
nlp = English()
pipe = nlp.add_pipe(factory_name, name=pipe_name)
for label in labels:
pipe.add_label(label)
nlp.initialize()
assert nlp.get_pipe(pipe_name).labels == labels
if factory_name == "spancat":
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
else:
assert _get_labels_from_model(nlp, factory_name) == set(labels)
def test_permitted_package_names():
# https://www.python.org/dev/peps/pep-0426/#name
assert _is_permitted_package_name("Meine_Bäume") == False
assert _is_permitted_package_name("_package") == False
assert _is_permitted_package_name("package_") == False
assert _is_permitted_package_name(".package") == False
assert _is_permitted_package_name("package.") == False
assert _is_permitted_package_name("-package") == False
assert _is_permitted_package_name("package-") == False
def test_debug_data_compile_gold():
nlp = English()
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 0
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 1


@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
from spacy.util import compile_infix_regex
from spacy.vocab import Vocab
from spacy.symbols import ORTH
@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
assert tokens == ["a", "10", "."]
explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
assert tokens == explain_tokens
def test_tokenizer_infix_prefix(en_vocab):
# the prefix and suffix matches overlap in the suffix lookbehind
infixes = ["±"]
suffixes = ["%"]
infix_re = compile_infix_regex(infixes)
suffix_re = compile_suffix_regex(suffixes)
tokenizer = Tokenizer(
en_vocab,
infix_finditer=infix_re.finditer,
suffix_search=suffix_re.search,
)
tokens = [t.text for t in tokenizer("±10%")]
assert tokens == ["±10", "%"]
explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
assert tokens == explain_tokens


@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
assert lex1.vector_norm != 0
assert lex2.vector_norm != 0
assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
assert isinstance(lex1.similarity(lex2), float)
assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
assert doc[0].vector_norm != 0
assert doc[1].vector_norm != 0
assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
assert isinstance(doc[0].similarity(doc[1]), float)
assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
def test_vectors_similarity_SS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc[0:1].similarity(doc[0:2]), float)
assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
def test_vectors_similarity_DD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc1 = Doc(vocab, words=[word1, word2])
doc2 = Doc(vocab, words=[word2, word1])
assert isinstance(doc1.similarity(doc2), float)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[-2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc.similarity(doc[:2]), float)
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)


@ -421,7 +421,7 @@ def test_vector_is_oov():
def test_init_vectors_unset():
v = Vectors(shape=(10, 10))
assert v.is_full is False
assert v.data.shape == (10, 10)
assert v.shape == (10, 10)
with pytest.raises(ValueError):
v = Vectors(shape=(10, 10), mode="floret")
@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# rows: 2 rows per ngram
rows = OPS.xp.asarray(
[
h % nlp.vocab.vectors.data.shape[0]
h % nlp.vocab.vectors.shape[0]
for ngram in ngrams
for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
],
@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# an empty key returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab[""].vector),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# an empty batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
numpy.zeros((1, nlp.vocab.vectors.shape[0])),
)
# an empty key within a batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# the loaded ngram vector table cannot be modified


@ -45,10 +45,12 @@ cdef class Tokenizer:
`re.compile(string).search` to match suffixes.
`infix_finditer` (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
token_match (callable): A function matching the signature of
`re.compile(string).match`, for matching strings to be
recognized as tokens.
url_match (callable): A boolean function matching strings to be
recognized as tokens after considering prefixes and suffixes.
url_match (callable): A function matching the signature of
`re.compile(string).match`, for matching strings to be
recognized as urls.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
@ -681,6 +683,8 @@ cdef class Tokenizer:
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if offset == 0 and match.start() == 0:
continue
if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]:


@ -10,7 +10,7 @@ from ..lexeme import Lexeme
from ..vocab import Vocab
from .underscore import Underscore
from pathlib import Path
import numpy
import numpy as np
class DocMethod(Protocol):
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@ -26,7 +26,7 @@ class Doc:
user_hooks: Dict[str, Callable[..., Any]]
user_token_hooks: Dict[str, Callable[..., Any]]
user_span_hooks: Dict[str, Callable[..., Any]]
tensor: numpy.ndarray
tensor: np.ndarray[Any, np.dtype[np.float_]]
user_data: Dict[str, Any]
has_unknown_spaces: bool
_context: Any
@ -144,7 +144,7 @@ class Doc:
) -> Doc: ...
def to_array(
self, py_attr_ids: Union[int, str, List[Union[int, str]]]
) -> numpy.ndarray: ...
) -> np.ndarray[Any, np.dtype[np.float_]]: ...
@staticmethod
def from_docs(
docs: List[Doc],


@ -420,6 +420,8 @@ cdef class Doc:
cdef int range_start = 0
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
attr = SENT_START
elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
attr = SENT_START
attr = intify_attr(attr)
# adjust attributes
if attr == HEAD:
@ -616,7 +618,7 @@ cdef class Doc:
"""
if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self)
elif self.vocab.vectors.data.size:
elif self.vocab.vectors.size:
return True
elif self.tensor.size:
return True
@ -641,7 +643,7 @@ cdef class Doc:
if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self)
return self._vector
elif self.tensor.size > 0:
@ -1183,7 +1185,7 @@ cdef class Doc:
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
if not (len(doc) > 0 and doc[-1].is_space):
if len(doc) > 0 and not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)


@ -364,7 +364,9 @@ cdef class Span:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
@ -497,7 +499,7 @@ cdef class Span:
"""
if "has_vector" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["has_vector"](self)
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True


@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
from .. import parts_of_speech
from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args
@ -209,7 +210,9 @@ cdef class Token:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
def has_morph(self):
"""Check whether the token has annotated morph information.
@ -484,8 +487,6 @@ cdef class Token:
RETURNS (bool / None): Whether the token starts a sentence.
None if unknown.
DOCS: https://spacy.io/api/token#is_sent_start
"""
def __get__(self):
if self.c.sent_start == 0:
@ -743,7 +744,7 @@ cdef class Token:
@classmethod
def iob_strings(cls):
return ("", "I", "O", "B")
return IOB_STRINGS
@property
def ent_iob_(self):


@ -1,17 +1,31 @@
from typing import Dict, Any
from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools
import copy
from ..errors import Errors
if TYPE_CHECKING:
from .doc import Doc
from .span import Span
from .token import Token
class Underscore:
mutable_types = (dict, list, set)
doc_extensions: Dict[Any, Any] = {}
span_extensions: Dict[Any, Any] = {}
token_extensions: Dict[Any, Any] = {}
_extensions: Dict[str, Any]
_obj: Union["Doc", "Span", "Token"]
_start: Optional[int]
_end: Optional[int]
def __init__(self, extensions, obj, start=None, end=None):
def __init__(
self,
extensions: Dict[str, Any],
obj: Union["Doc", "Span", "Token"],
start: Optional[int] = None,
end: Optional[int] = None,
):
object.__setattr__(self, "_extensions", extensions)
object.__setattr__(self, "_obj", obj)
# Assumption is that for doc values, _start and _end will both be None
@ -23,12 +37,12 @@ class Underscore:
object.__setattr__(self, "_start", start)
object.__setattr__(self, "_end", end)
def __dir__(self):
def __dir__(self) -> List[str]:
# Hack to enable autocomplete on custom extensions
extensions = list(self._extensions.keys())
return ["set", "get", "has"] + extensions
def __getattr__(self, name):
def __getattr__(self, name: str) -> Any:
if name not in self._extensions:
raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name]
@ -56,7 +70,7 @@ class Underscore:
return new_default
return default
def __setattr__(self, name, value):
def __setattr__(self, name: str, value: Any):
if name not in self._extensions:
raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name]
@ -65,28 +79,30 @@ class Underscore:
else:
self._doc.user_data[self._get_key(name)] = value
def set(self, name, value):
def set(self, name: str, value: Any):
return self.__setattr__(name, value)
def get(self, name):
def get(self, name: str) -> Any:
return self.__getattr__(name)
def has(self, name):
def has(self, name: str) -> bool:
return name in self._extensions
def _get_key(self, name):
def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
return ("._.", name, self._start, self._end)
@classmethod
def get_state(cls):
def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
return cls.token_extensions, cls.span_extensions, cls.doc_extensions
@classmethod
def load_state(cls, state):
def load_state(
cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
) -> None:
cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
def get_ext_args(**kwargs):
def get_ext_args(**kwargs: Any):
"""Validate and convert arguments. Reused in Doc, Token and Span."""
default = kwargs.get("default")
getter = kwargs.get("getter")


@ -164,7 +164,7 @@ def load_vectors_into_model(
len(vectors_nlp.vocab.vectors.keys()) == 0
and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
) or (
vectors_nlp.vocab.vectors.data.shape[0] == 0
vectors_nlp.vocab.vectors.shape[0] == 0
and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
):
logger.warning(Warnings.W112.format(name=name))


@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path:
name (str): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name)


@ -1,5 +1,5 @@
cimport numpy as np
from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64
@ -10,7 +10,7 @@ from typing import cast
import warnings
from enum import Enum
import srsly
from thinc.api import get_array_module, get_current_ops
from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
@ -146,7 +146,7 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#size
"""
return self.data.shape[0] * self.data.shape[1]
return self.data.size
@property
def is_full(self):
@ -274,7 +274,7 @@ cdef class Vectors:
self.data = resized_array
self._sync_unset()
removed_items = []
for key, row in list(self.key2row.items()):
for key, row in self.key2row.copy().items():
if row >= shape[0]:
self.key2row.pop(key)
removed_items.append((key, row))
@ -353,12 +353,18 @@ cdef class Vectors:
key (str): The string key.
RETURNS: A list of the integer hashes.
"""
cdef uint32_t[4] out
# MurmurHash3_x64_128 returns an array of 2 uint64_t values.
cdef uint64_t[2] out
chars = s.encode("utf8")
cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))]
return rows
rows = [
out[0] & 0xffffffffu,
out[0] >> 32,
out[1] & 0xffffffffu,
out[1] >> 32,
]
return rows[:min(self.hash_count, 4)]
def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings.
@ -511,6 +517,9 @@ cdef class Vectors:
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
def to_ops(self, ops: Ops):
self.data = ops.asarray(self.data)
def _get_cfg(self):
if self.mode == Mode.default:
return {
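
`to_ops` is what `Language.initialize` and `resume_training` now call instead of assigning to `vectors.data` directly (see the `language.py` hunk above); a minimal sketch:

```python
from thinc.api import get_current_ops
from spacy.lang.en import English

nlp = English()
ops = get_current_ops()
# Move the vector table to the current backend (numpy, or cupy on GPU).
nlp.vocab.vectors.to_ops(ops)
```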


@ -283,7 +283,7 @@ cdef class Vocab:
@property
def vectors_length(self):
return self.vectors.data.shape[1]
return self.vectors.shape[1]
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
@ -294,7 +294,7 @@ cdef class Vocab:
elif shape is not None:
self.vectors = Vectors(strings=self.strings, shape=shape)
else:
width = width if width is not None else self.vectors.data.shape[1]
width = width if width is not None else self.vectors.shape[1]
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
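A minimal usage sketch of `reset_vectors` with the updated shape handling, assuming a blank pipeline (the table size is chosen arbitrarily):

```python
import spacy

nlp = spacy.blank("en")
# Drop whatever table exists and start over with 100 rows of 300-dim vectors.
nlp.vocab.reset_vectors(shape=(100, 300))
print(nlp.vocab.vectors.shape)   # (100, 300)
```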
def prune_vectors(self, nr_row, batch_size=1024):

View File

@ -79,6 +79,7 @@ train/test skew.
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ |
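A minimal sketch of reading a corpus with the new `shuffle` flag documented above; the path is a placeholder and the pipeline is just a blank one:

```python
import spacy
from spacy.training import Corpus

nlp = spacy.blank("en")
# shuffle=True randomizes the example order on each pass over the data.
corpus = Corpus("./corpus/train.spacy", shuffle=True)
examples = corpus(nlp)           # lazily yields Example objects
```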
## Corpus.\_\_call\_\_ {#call tag="method"}

View File

@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
## Doc.has_annotation {#has_annotation tag="method"}
Check whether the doc contains annotation on a token attribute.
Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).
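For example, a small sketch using only a blank pipeline plus a sentencizer, so no trained components are required:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("This is a sentence. This is another one.")
print(doc.has_annotation("SENT_START"))   # True, set by the sentencizer
print(doc.has_annotation("DEP"))          # False, no parser in this pipeline
```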
<Infobox title="Changed in v3.0" variant="warning">

View File

@ -44,6 +44,7 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ |
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
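A sketch of the new `ENT_IOB` attribute in a token pattern; it assumes a pipeline with a trained entity recognizer such as `en_core_web_sm`, which has to be installed separately:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# A token that begins an entity, followed by one or more tokens inside it.
matcher.add("MULTI_TOKEN_ENT", [[{"ENT_IOB": "B"}, {"ENT_IOB": "I", "OP": "+"}]])

doc = nlp("Apple is opening a store in New York City.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)   # e.g. "New York", "New York City"
```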

View File

@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
| ---------- | ------------------------------------------------------------------------------------ |
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
A boolean value indicating whether the token starts a sentence. `None` if
unknown. Defaults to `True` for the first token in the `Doc`.
> #### Example
>
> ```python
> doc = nlp("Give it back! He pleaded.")
> assert doc[4].is_sent_start
> assert not doc[5].is_sent_start
> ```
| Name | Description |
| ----------- | ------------------------------------------------------- |
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
## Token.has_vector {#has_vector tag="property" model="vectors"}
A boolean value indicating whether a word vector is associated with the token.
@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
| `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
| `is_sent_start` | Does the token start a sentence? Defaults to `True` for the first token in the `Doc`, `None` if unknown. ~~Optional[bool]~~ |
| `is_sent_end` | Does the token end a sentence? `None` if unknown. ~~Optional[bool]~~ |
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
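The standalone `is_sent_start` section above is folded into the attributes table; as a sketch, the behavior it documented can still be checked with just a sentencizer:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Give it back! He pleaded.")
assert doc[4].is_sent_start           # "He" opens the second sentence
assert doc[3].is_sent_end             # "!" closes the first sentence
assert doc[5].is_sent_start is False  # "pleaded" does not start a sentence
```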

View File

@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
| ------ | --------------------------------------- |
| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
## Vectors.to_ops {#to_ops tag="method"}
Change the embedding matrix to use different Thinc ops.
> #### Example
>
> ```python
> from thinc.api import NumpyOps
>
> vectors.to_ops(NumpyOps())
> vectors.to_ops(NumpyOps())
> ```
| Name | Description |
| ------ | --------------------------------------------------------- |
| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
## Vectors.to_disk {#to_disk tag="method"}
Save the current state to a directory.

View File

@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if offset == 0 and match.start() == 0:
continue
tokens.append(substring[offset : match.start()])
tokens.append(substring[match.start() : match.end()])
offset = match.end()
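A self-contained sketch of the loop above with a toy infix pattern, showing what the new guard prevents: without it, an infix match at position 0 would emit an empty token before the leading infix character.

```python
import re

infix_finditer = re.compile(r"-").finditer   # toy infix rule: hyphens only

def split_infixes(substring: str):
    tokens = []
    offset = 0
    for match in infix_finditer(substring):
        if offset == 0 and match.start() == 0:
            continue   # the guard: skip an infix match at the very start
        tokens.append(substring[offset : match.start()])
        tokens.append(substring[match.start() : match.end()])
        offset = match.end()
    if offset < len(substring):
        tokens.append(substring[offset:])    # trailing remainder
    return tokens

print(split_infixes("-well-known"))   # ['-well', '-', 'known'], no empty token
```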

View File

@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
a quick web demo. It looks pretty similar to a config file used to define CI
pipelines.
> #### Tip: Multi-line YAML syntax for long values
>
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
> helpful for readability with longer values such as project descriptions or
> commands that take several arguments.
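As a small demonstration of the folded block style (this sketch assumes PyYAML is available; a `project.yml` itself only needs the YAML):

```python
import yaml

config = yaml.safe_load(
    """
description: >
  This pipeline tags and parses Universal Dependencies data,
  then packages the trained artifacts.
"""
)
print(config["description"])   # the folded scalar collapses into a single line
```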
```yaml
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```

View File

@ -141,7 +141,8 @@
"website": "https://www.nr.no/~plison"
},
"category": ["pipeline", "standalone", "research", "training"],
"tags": []
"tags": [],
"spacy_version": 3
},
{
"id": "numerizer",
@ -952,6 +953,37 @@
"category": ["pipeline"],
"tags": ["lemmatizer", "danish"]
},
{
"id": "augmenty",
"title": "Augmenty",
"slogan": "The cherry on top of your NLP pipeline",
"description": "Augmenty is an augmentation library based on spaCy for augmenting texts. Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.",
"github": "kennethenevoldsen/augmenty",
"pip": "augmenty",
"code_example": [
"import spacy",
"import augmenty",
"",
"nlp = spacy.load('en_core_web_md')",
"",
"docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])",
"",
"ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}",
"entity_augmenter = augmenty.load('ents_replace.v1',",
" ent_dict = ent_dict, level=1)",
"",
"for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):",
" print(doc)"
],
"thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true",
"author": "Kenneth Enevoldsen",
"author_links": {
"github": "kennethenevoldsen",
"website": "https://www.kennethenevoldsen.com"
},
"category": ["training", "research"],
"tags": ["training", "research", "augmentation"]
},
{
"id": "dacy",
"title": "DaCy",

View File

@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid'
import Button from '../components/button'
import Icon from '../components/icon'
import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside'
import Sidebar from '../components/sidebar'
import Section from '../components/section'
import Section, { Hr } from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography'
@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
</Grid>
</Section>
)}
<section className="search-exclude">
<H3>Found a mistake or something isn't working?</H3>
<p>
If you've come across a universe project that isn't working or is
incompatible with the reported spaCy version, let us know by{' '}
<Link to="https://github.com/explosion/spaCy/discussions/new">
opening a discussion thread
</Link>
.
</p>
</section>
<Hr />
<section className="search-exclude">
<H3>Submit your project</H3>
<p>
@ -168,11 +181,22 @@ UniverseContent.propTypes = {
mdxComponents: PropTypes.object,
}
const SpaCyVersion = ({ version }) => {
const versions = !Array.isArray(version) ? [version] : version
return versions.map((v, i) => (
<>
<Tag tooltip={`This project is compatible with spaCy v${v}`}>spaCy v{v}</Tag>{' '}
</>
))
}
const Project = ({ data, components }) => (
<>
<Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}>
{data.github && (
{(data.github || data.spacy_version) && (
<p>
{data.spacy_version && <SpaCyVersion version={data.spacy_version} />}
{data.github && (
<Link to={`https://github.com/${data.github}`} hidden>
{[
`release/${data.github}/all.svg?style=flat-square`,
@ -180,13 +204,18 @@ const Project = ({ data, components }) => (
`stars/${data.github}.svg?style=social&label=Stars`,
].map((url, i) => (
<img
style={{ borderRadius: '1em', marginRight: '0.5rem' }}
style={{
borderRadius: '1em',
marginRight: '0.5rem',
verticalAlign: 'middle',
}}
key={i}
src={`https://img.shields.io/github/${url}`}
alt=""
/>
))}
</Link>
)}
</p>
)}
</Title>
@ -335,6 +364,7 @@ const query = graphql`
url
github
description
spacy_version
pip
cran
category