diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 09de1cd05..fce1a1064 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,8 +1,11 @@
 blank_issues_enabled: false
 contact_links:
+  - name: ⚠️ Python 3.10 Support
+    url: https://github.com/explosion/spaCy/discussions/9418
+    about: Python 3.10 wheels haven't been released yet, see the link for details.
   - name: 🗯 Discussions Forum
     url: https://github.com/explosion/spaCy/discussions
-    about: Usage questions, general discussion and anything else that isn't a bug report.
+    about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
   - name: 📖 spaCy FAQ & Troubleshooting
     url: https://github.com/explosion/spaCy/discussions/8226
     about: Before you post, check out the FAQ for answers to common community questions!
diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 50e81799e..543804b9f 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -100,3 +100,8 @@ steps:
       python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Test website/meta/universe.json'
+    condition: eq(variables['python_version'], '3.8')
diff --git a/.github/validate_universe_json.py b/.github/validate_universe_json.py
new file mode 100644
index 000000000..b96b7b347
--- /dev/null
+++ b/.github/validate_universe_json.py
@@ -0,0 +1,33 @@
+import json
+import re
+import sys
+from pathlib import Path
+
+
+def validate_json(document):
+    """Check that "github" fields in a universe JSON file are not URLs.
+
+    document (str or Path): Path to the JSON file. Its top-level "resources"
+        key holds the entries that are checked.
+    RAISES (AssertionError): If an entry's "github" value starts with
+        "http:" or "https:".
+    """
+    universe_file = Path(document)
+    # Read as UTF-8 explicitly so that the result does not depend on the
+    # platform's default text encoding.
+    with universe_file.open(encoding="utf8") as f:
+        universe_data = json.load(f)
+    for entry in universe_data["resources"]:
+        if "github" not in entry:
+            continue
+        # Raise explicitly instead of using an `assert` statement, which
+        # would be skipped when Python runs with -O.
+        if re.match(r"https?:", entry["github"]):
+            raise AssertionError(
+                "Github field should be user/repo, not a url: "
+                f"{entry['github']!r}"
+            )
+
+
+if __name__ == "__main__":
+    validate_json(str(sys.argv[1]))
diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml
new file mode 100644
index 000000000..7d9ee45e9
--- /dev/null
+++ b/.github/workflows/explosionbot.yml
@@ -0,0 +1,31 @@
+name: Explosion Bot
+
+# Run the bot whenever an issue comment is created or edited.
+on:
+  issue_comment:
+    types:
+      - created
+      - edited
+
+jobs:
+  explosion-bot:
+    runs-on: ubuntu-18.04
+    steps:
+      # Print the full `github` context to the job log for debugging.
+      - name: Dump GitHub context
+        env:
+          GITHUB_CONTEXT: ${{ toJson(github) }}
+        run: echo "$GITHUB_CONTEXT"
+      - uses: actions/checkout@v1
+      - uses: actions/setup-python@v1
+      - name: Install and run explosion-bot
+        # The token is read from the environment rather than templated into
+        # the script text, so the shell never has to parse its value.
+        run: |
+          pip install "git+https://${INPUT_TOKEN}@github.com/explosion/explosion-bot"
+          python -m explosionbot
+        env:
+          INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
+          INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
+          ENABLED_COMMANDS: "test_gpu"
+          ALLOWED_TEAMS: "spaCy"
diff --git a/CITATION b/CITATION
deleted file mode 100644
index bdaa90677..000000000
--- a/CITATION
+++ /dev/null
@@ -1,8 +0,0 @@
-@software{spacy,
-  author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
-  title = {{spaCy: Industrial-strength Natural Language Processing in Python}},
-  year = 2020,
-  publisher = {Zenodo},
-  doi = {10.5281/zenodo.1212303},
-  url = {https://doi.org/10.5281/zenodo.1212303}
-}
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 000000000..88c05b2a3
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,29 @@
+# Citation metadata in the Citation File Format (CFF).
+cff-version: 1.2.0
+message: "If you use spaCy, please cite it as below."
+title: "spaCy: Industrial-strength Natural Language Processing in Python"
+authors:
+- family-names: "Honnibal"
+  given-names: "Matthew"
+- family-names: "Montani"
+  given-names: "Ines"
+- family-names: "Van Landeghem"
+  given-names: "Sofie"
+- family-names: "Boyd"
+  given-names: "Adriane"
+doi: "10.5281/zenodo.1212303"
+preferred-citation:
+  # The DOI points to a Zenodo record, so this is cited as software.
+  type: software
+  authors:
+  - family-names: "Honnibal"
+    given-names: "Matthew"
+  - family-names: "Montani"
+    given-names: "Ines"
+  - family-names: "Van Landeghem"
+    given-names: "Sofie"
+  - family-names: "Boyd"
+    given-names: "Adriane"
+  title: "spaCy: Industrial-strength Natural Language Processing in Python"
+  doi: "10.5281/zenodo.1212303"
+  year: 2020
diff --git a/MANIFEST.in b/MANIFEST.in
index 99fc174bd..d022223cd 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,5 @@
 recursive-include include *.h
-recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
+recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml
 include LICENSE
 include README.md
 include pyproject.toml
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index ac80b8a10..844946845 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -16,6 +16,8 @@ pr:
     exclude:
       - "website/*"
       - "*.md"
+    include:
+      - "website/meta/universe.json"
 
 jobs:
   # Perform basic checks for most important errors (syntax etc.) Uses the config
diff --git a/setup.cfg b/setup.cfg
index 4313612d4..e3a9af5c1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -124,7 +124,8 @@ exclude =
 
 [tool:pytest]
 markers =
-    slow
+    slow: mark a test as slow
+    issue: reference specific issue
 
 [mypy]
 ignore_missing_imports = True
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 9fd87dbc1..664fc2aaf 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Dict, Any
 from pathlib import Path
 from wasabi import msg
 import typer
@@ -7,7 +7,7 @@ import sys
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
-from ..training.loop import train
+from ..training.loop import train as train_nlp
 from ..training.initialize import init_nlp
 from .. import util
 
@@ -40,6 +40,18 @@ def train_cli(
     DOCS: https://spacy.io/api/cli#train
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
+
+
+def train(
+    config_path: Path,
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
+):
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
@@ -50,8 +62,6 @@ def train_cli(
             output_path.mkdir(parents=True)
             msg.good(f"Created output directory: {output_path}")
         msg.info(f"Saving to output directory: {output_path}")
-    overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
     setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
@@ -60,4 +70,4 @@ def train_cli(
         nlp = init_nlp(config, use_gpu=use_gpu)
     msg.good("Initialized pipeline")
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
+    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
diff --git a/spacy/errors.py b/spacy/errors.py
index 120828fd6..4b617ecf3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -25,7 +25,7 @@ def setup_default_warnings():
         filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
 
     # warn once about lemmatizer without required POS
-    filter_warning("once", error_msg="[W108]")
+    filter_warning("once", error_msg=Warnings.W108)
 
 
 def filter_warning(action: str, error_msg: str):
@@ -170,8 +170,8 @@ class Warnings:
             "call the {matcher} on each Doc object.")
     W107 = ("The property `Doc.{prop}` is deprecated. Use "
             "`Doc.has_annotation(\"{attr}\")` instead.")
-    W108 = ("The rule-based lemmatizer did not find POS annotation for the "
-            "token '{text}'. Check that your pipeline includes components that "
+    W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
+            "more tokens. Check that your pipeline includes components that "
             "assign token.pos, typically 'tagger'+'attribute_ruler' or "
             "'morphologizer'.")
     W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
@@ -658,7 +658,9 @@ class Errors:
             "{nO} - cannot add any more labels.")
     E923 = ("It looks like there is no proper sample data to initialize the "
             "Model of component '{name}'. To check your input data paths and "
-            "annotation, run: python -m spacy debug data config.cfg")
+            "annotation, run: python -m spacy debug data config.cfg "
+            "and include the same config override values you would specify "
+            "for the 'spacy train' command.")
     E924 = ("The '{name}' component does not seem to be initialized properly. "
             "This is likely a bug in spaCy, so feel free to open an issue: "
             "https://github.com/explosion/spaCy/issues")
@@ -793,7 +795,7 @@ class Errors:
             "to token boundaries.")
     E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
             "into {values}, but found {value}.")
-    E983 = ("Invalid key for '{dict}': {key}. Available keys: "
+    E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
             "{keys}")
     E984 = ("Invalid component config for '{name}': component block needs either "
             "a key `factory` specifying the registered function used to "
diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py
index 2518eb720..2fd012912 100644
--- a/spacy/lang/ca/lemmatizer.py
+++ b/spacy/lang/ca/lemmatizer.py
@@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
             forms.append(self.lookup_lemmatize(token)[0])
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py
index bb5a270ab..c6422cf96 100644
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
             forms.append(self.lookup_lemmatize(token)[0])
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py
index 6c025dcf6..4f6b2ef30 100644
--- a/spacy/lang/nl/lemmatizer.py
+++ b/spacy/lang/nl/lemmatizer.py
@@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
                     return forms
                 else:
                     oov_forms.append(form)
-        forms = list(set(oov_forms))
+        forms = list(dict.fromkeys(oov_forms))
         # Back-off through remaining return value candidates.
         if forms:
             for form in forms:
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 185e09718..a56938641 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -58,7 +58,7 @@ class RussianLemmatizer(Lemmatizer):
         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
-            return list(set([analysis.normal_form for analysis in filtered_analyses]))
+            return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -89,7 +89,7 @@ class RussianLemmatizer(Lemmatizer):
                 filtered_analyses.append(analysis)
         if not len(filtered_analyses):
             return [string.lower()]
-        return list(set([analysis.normal_form for analysis in filtered_analyses]))
+        return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
 
     def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
         string = token.text
diff --git a/spacy/language.py b/spacy/language.py
index d87f86bd3..fd3773f82 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -707,8 +707,9 @@ class Language:
         source_config = source.config.interpolate()
         pipe_config = util.copy_config(source_config["components"][source_name])
         self._pipe_configs[name] = pipe_config
-        for s in source.vocab.strings:
-            self.vocab.strings.add(s)
+        if self.vocab.strings != source.vocab.strings:
+            for s in source.vocab.strings:
+                self.vocab.strings.add(s)
         return pipe, pipe_config["factory"]
 
     def add_pipe(
@@ -1379,6 +1380,9 @@ class Language:
             scorer = Scorer(**kwargs)
         # reset annotation in predicted docs and time tokenization
         start_time = timer()
+        # this is purely for timing
+        for eg in examples:
+            self.make_doc(eg.reference.text)
         # apply all pipeline components
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
@@ -1708,6 +1712,7 @@ class Language:
         # them here so they're only loaded once
         source_nlps = {}
         source_nlp_vectors_hashes = {}
+        vocab_b = None
         for pipe_name in config["nlp"]["pipeline"]:
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
@@ -1730,14 +1735,22 @@ class Language:
                         raw_config=raw_config,
                     )
                 else:
+                    # We need the sourced components to reference the same
+                    # vocab without modifying the current vocab state **AND**
+                    # we still want to load the source model vectors to perform
+                    # the vectors check. Since the source vectors clobber the
+                    # current ones, we save the original vocab state and
+                    # restore after this loop. Existing strings are preserved
+                    # during deserialization, so they do not need any
+                    # additional handling.
+                    if vocab_b is None:
+                        vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
                     model = pipe_cfg["source"]
                     if model not in source_nlps:
-                        # We only need the components here and we intentionally
-                        # do not load the model with the same vocab because
-                        # this would cause the vectors to be copied into the
-                        # current nlp object (all the strings will be added in
-                        # create_pipe_from_source)
-                        source_nlps[model] = util.load_model(model)
+                        # Load with the same vocab, adding any strings
+                        source_nlps[model] = util.load_model(
+                            model, vocab=nlp.vocab, exclude=["lookups"]
+                        )
                     source_name = pipe_cfg.get("component", pipe_name)
                     listeners_replaced = False
                     if "replace_listeners" in pipe_cfg:
@@ -1764,6 +1777,9 @@ class Language:
                     # Delete from cache if listeners were replaced
                     if listeners_replaced:
                         del source_nlps[model]
+        # Restore the original vocab after sourcing if necessary
+        if vocab_b is not None:
+            nlp.vocab.from_bytes(vocab_b)
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
         nlp.batch_size = config["nlp"]["batch_size"]
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index f204ce224..a602ba737 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -177,13 +177,14 @@ cdef class DependencyMatcher:
 
         # Add 'RIGHT_ATTRS' to self._patterns[key]
         _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
+        pattern_offset = len(self._patterns[key])
         self._patterns[key].extend(_patterns)
 
         # Add each node pattern of all the input patterns individually to the
         # matcher. This enables only a single instance of Matcher to be used.
         # Multiple adds are required to track each node pattern.
         tokens_to_key_list = []
-        for i, current_patterns in enumerate(_patterns):
+        for i, current_patterns in enumerate(_patterns, start=pattern_offset):
 
             # Preallocate list space
             tokens_to_key = [None] * len(current_patterns)
@@ -263,7 +264,9 @@ cdef class DependencyMatcher:
         self._raw_patterns.pop(key)
         self._tree.pop(key)
         self._root.pop(key)
-        self._tokens_to_key.pop(key)
+        for mklist in self._tokens_to_key.pop(key):
+            for mkey in mklist:
+                self._matcher.remove(mkey)
 
     def _get_keys_to_position_maps(self, doc):
         """
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 74f502d80..5adae10d2 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -208,7 +208,7 @@ class Lemmatizer(Pipe):
         univ_pos = token.pos_.lower()
         if univ_pos in ("", "eol", "space"):
             if univ_pos == "":
-                warnings.warn(Warnings.W108.format(text=string))
+                warnings.warn(Warnings.W108)
             return [string.lower()]
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(token):
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index a5dedcc87..10982bac1 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -4,6 +4,7 @@ from spacy.util import get_lang_class
 
 def pytest_addoption(parser):
     parser.addoption("--slow", action="store_true", help="include slow tests")
+    parser.addoption("--issue", action="store", help="test specific issues")
 
 
 def pytest_runtest_setup(item):
@@ -16,10 +17,24 @@ def pytest_runtest_setup(item):
         # options weren't given.
         return item.config.getoption(f"--{opt}", False)
 
+    # Integration of boolean flags
     for opt in ["slow"]:
         if opt in item.keywords and not getopt(opt):
             pytest.skip(f"need --{opt} option to run")
 
+    # Special integration to mark tests with issue numbers
+    issues = getopt("issue")
+    if isinstance(issues, str):
+        if "issue" in item.keywords:
+            # Convert issues provided on the CLI to list of ints
+            issue_nos = [int(issue.strip()) for issue in issues.split(",")]
+            # Get all issues specified by decorators and check if they're provided
+            issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")]
+            if not any([ref in issue_nos for ref in issue_refs]):
+                pytest.skip(f"not referencing specified issues: {issue_nos}")
+        else:
+            pytest.skip("not referencing any issues")
+
 
 # Fixtures for language tokenizers (languages sorted alphabetically)
 
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index 0e1eae588..61ae43c52 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
         assert doc_match[0] == span_match[0]
         for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
             assert doc_t_i == span_t_i + offset
+
+
+def test_dependency_matcher_order_issue(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("I like text")
+    doc[2].head = doc[1]
+
+    # this matches on attrs but not rel op
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "text"},
+            "REL_OP": "<",
+        },
+    ]
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "fish"},
+            "REL_OP": ">",
+        },
+    ]
+
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+
+    # This should behave the same as adding the patterns one at a time below
+    matcher.add("check", [pattern1, pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+    # use a new matcher
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    # adding one at a time under same label gets a match
+    matcher.add("check", [pattern1])
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
+
+
+def test_dependency_matcher_remove(en_tokenizer):
+    # issue from #9263
+    doc = en_tokenizer("The red book")
+    doc[1].head = doc[2]
+
+    # this matches
+    pattern1 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "red"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # add and then remove it
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    matcher.add("check", [pattern1])
+    matcher.remove("check")
+
+    # this matches on rel op but not attrs
+    pattern2 = [
+        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
+        {
+            "LEFT_ID": "root",
+            "RIGHT_ID": "r",
+            "RIGHT_ATTRS": {"ORTH": "blue"},
+            "REL_OP": ">",
+        },
+    ]
+
+    # Adding this new pattern with the same label, which should not match
+    matcher.add("check", [pattern2])
+    matches = matcher(doc)
+
+    assert matches == []
diff --git a/spacy/tests/package/__init__.py b/spacy/tests/package/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 7b759f8f6..d4d0617d7 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -114,7 +114,7 @@ def test_make_spangroup(max_positive, nr_results):
     doc = nlp.make_doc("Greater London")
     ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
     indices = ngram_suggester([doc])[0].dataXd
-    assert_array_equal(indices, numpy.asarray([[0, 1], [1, 2], [0, 2]]))
+    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
     labels = ["Thing", "City", "Person", "GreatCity"]
     scores = numpy.asarray(
         [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py
index a35de92fa..355ffffeb 100644
--- a/spacy/tests/regression/test_issue5501-6000.py
+++ b/spacy/tests/regression/test_issue5501-6000.py
@@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
     # All results should be the same because of the fixed seed
     assert len(results) == 3
     ops = get_current_ops()
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
-    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
 
 
 def test_issue5838():
diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py
new file mode 100644
index 000000000..811952792
--- /dev/null
+++ b/spacy/tests/regression/test_issue7716.py
@@ -0,0 +1,59 @@
+import pytest
+from thinc.api import Adam
+from spacy.attrs import NORM
+from spacy.vocab import Vocab
+from spacy import registry
+from spacy.training import Example
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.tokens import Doc
+from spacy.pipeline import DependencyParser
+
+
+@pytest.fixture
+def vocab():
+    """Vocab whose NORM getter returns the string unchanged."""
+    return Vocab(lex_attr_getters={NORM: lambda s: s})
+
+
+def _parser_example(parser):
+    """Build the four-token example that is used to initialize the parser."""
+    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+    gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
+    return Example.from_dict(doc, gold)
+
+
+@pytest.fixture
+def parser(vocab):
+    """Parser that was initialized and then updated 10 times on one example."""
+    vocab.strings.add("ROOT")
+    cfg = {"model": DEFAULT_PARSER_MODEL}
+    model = registry.resolve(cfg, validate=True)["model"]
+    parser = DependencyParser(vocab, model)
+    parser.cfg["token_vector_width"] = 4
+    parser.cfg["hidden_width"] = 32
+    # parser.add_label('right')
+    parser.add_label("left")
+    parser.initialize(lambda: [_parser_example(parser)])
+    sgd = Adam(0.001)
+
+    for _ in range(10):
+        losses = {}
+        doc = Doc(vocab, words=["a", "b", "c", "d"])
+        example = Example.from_dict(
+            doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
+        )
+        parser.update([example], sgd=sgd, losses=losses)
+    return parser
+
+
+@pytest.mark.issue(7716)
+@pytest.mark.xfail(reason="Not fixed yet")
+def test_partial_annotation(parser):
+    """A sentence start value set before parsing should survive the parser."""
+    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+    doc[2].is_sent_start = False
+    # Note that if the following line is used, then doc[2].is_sent_start == False
+    # doc[3].is_sent_start = False
+
+    doc = parser(doc)
+    assert doc[2].is_sent_start is False
diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py
index fbddf643c..e3f3b5cfa 100644
--- a/spacy/tests/regression/test_issue8168.py
+++ b/spacy/tests/regression/test_issue8168.py
@@ -1,6 +1,8 @@
+import pytest
 from spacy.lang.en import English
 
 
+@pytest.mark.issue(8168)
 def test_issue8168():
     nlp = English()
     ruler = nlp.add_pipe("entity_ruler")
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 47540198a..2306cabb7 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -193,6 +193,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     assert_array_almost_equal(
         model1.ops.to_numpy(get_all_params(model1)),
         model2.ops.to_numpy(get_all_params(model2)),
+        decimal=5,
     )
 
 
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index f53660818..1f262c011 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -82,15 +82,15 @@ def test_cat_readers(reader, additional_config):
 
     [nlp]
     lang = "en"
-    pipeline = ["tok2vec", "textcat"]
+    pipeline = ["tok2vec", "textcat_multilabel"]
 
     [components]
 
     [components.tok2vec]
     factory = "tok2vec"
 
-    [components.textcat]
-    factory = "textcat"
+    [components.textcat_multilabel]
+    factory = "textcat_multilabel"
     """
     config = Config().from_str(nlp_config_string)
     config["corpora"]["@readers"] = reader
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 5e7d170f8..1d26b968c 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -8,7 +8,7 @@ from thinc.api import NumpyOps
 from .doc import Doc
 from ..vocab import Vocab
 from ..compat import copy_reg
-from ..attrs import SPACY, ORTH, intify_attr
+from ..attrs import SPACY, ORTH, intify_attr, IDS
 from ..errors import Errors
 from ..util import ensure_path, SimpleFrozenList
 
@@ -64,7 +64,13 @@ class DocBin:
 
         DOCS: https://spacy.io/api/docbin#init
         """
-        attrs = sorted([intify_attr(attr) for attr in attrs])
+        int_attrs = [intify_attr(attr) for attr in attrs]
+        if None in int_attrs:
+            non_valid = [attr for attr in attrs if intify_attr(attr) is None]
+            raise KeyError(
+                Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())
+            ) from None
+        attrs = sorted(int_attrs)
         self.version = "0.1"
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 22f1e64b1..a4feb01f4 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,4 +1,4 @@
-from .corpus import Corpus  # noqa: F401
+from .corpus import Corpus, JsonlCorpus  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index bd014f75f..4eb8ea276 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -144,7 +144,12 @@ def load_vectors_into_model(
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
-        vectors_nlp = load_model(name)
+        # Load with the same vocab, which automatically adds the vectors to
+        # the current nlp object. Exclude lookups so they are not modified.
+        exclude = ["lookups"]
+        if not add_strings:
+            exclude.append("strings")
+        vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
     except ConfigValidationError as e:
         title = f"Config validation error for vectors {name}"
         desc = (
@@ -158,15 +163,8 @@ def load_vectors_into_model(
     if len(vectors_nlp.vocab.vectors.keys()) == 0:
         logger.warning(Warnings.W112.format(name=name))
 
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
     for lex in nlp.vocab:
         lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
 def init_tok2vec(
diff --git a/spacy/util.py b/spacy/util.py
index fc1c0e76d..b25be5361 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1475,7 +1475,7 @@ def get_arg_names(func: Callable) -> List[str]:
     RETURNS (List[str]): The argument names.
     """
     argspec = inspect.getfullargspec(func)
-    return list(set([*argspec.args, *argspec.kwonlyargs]))
+    return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
 
 
 def combine_score_weights(
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 402528f28..ef4435656 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -530,7 +530,6 @@ cdef class Vocab:
 
         setters = {
             "strings": lambda b: self.strings.from_bytes(b),
-            "lexemes": lambda b: self.lexemes_from_bytes(b),
             "vectors": lambda b: serialize_vectors(b),
             "lookups": lambda b: self.lookups.from_bytes(b),
         }
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index aadeebd77..470d11a3a 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -260,16 +260,18 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
 
 | Name                                             | Description                                                                                                                               |
 | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
-| `input_file`                                     | Input file. ~~Path (positional)~~                                                                                                         |
+| `input_path`                                     | Input file or directory. ~~Path (positional)~~                                                                                            |
 | `output_dir`                                     | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~            |
 | `--converter`, `-c` <Tag variant="new">2</Tag>   | Name of converter to use (see below). ~~str (option)~~                                                                                    |
 | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
 | `--n-sents`, `-n`                                | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~                                         |
 | `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~                                                                          |
-| `--base`, `-b`                                   | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~                            |
+| `--base`, `-b`, `--model`                        | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str] \(option)~~                          |
 | `--morphology`, `-m`                             | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~                                                              |
+| `--merge-subtokens`, `-T`                        | Merge CoNLL-U subtokens ~~bool (flag)~~                                                                                                   |
 | `--ner-map`, `-nm`                               | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~                                |
 | `--lang`, `-l` <Tag variant="new">2.1</Tag>      | Language code (if tokenizer required). ~~Optional[str] \(option)~~                                                                        |
+| `--concatenate`, `-C`                            | Concatenate output to a single file ~~bool (flag)~~                                                                                       |
 | `--help`, `-h`                                   | Show help message and available arguments. ~~bool (flag)~~                                                                                |
 | **CREATES**                                      | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train).                                       |
 
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 44c92d1ee..44a2ea9e8 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -474,8 +474,8 @@ The L2 norm of the token's vector representation.
 | `like_email`                                 | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                   |
 | `is_oov`                                     | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~                                                                                                                                                                                       |
 | `is_stop`                                    | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                         |
-| `pos`                                        | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~                                                                                                                                               |
-| `pos_`                                       | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~                                                                                                                                               |
+| `pos`                                        | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~                                                                                                                                                    |
+| `pos_`                                       | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~                                                                                                                                                    |
 | `tag`                                        | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                 |
 | `tag_`                                       | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                 |
 | `morph` <Tag variant="new">3</Tag>           | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                            |
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index 40a3c3b22..c37b27a0e 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -325,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument.
 | Name      | Description                                           |
 | --------- | ----------------------------------------------------- |
 | `strings` | The strings in the [`StringStore`](/api/stringstore). |
-| `lexemes` | The lexeme data.                                      |
 | `vectors` | The word vectors, if available.                       |
 | `lookups` | The lookup tables, if available.                      |
diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md
index a531b245e..93ad0961a 100644
--- a/website/docs/usage/101/_pos-deps.md
+++ b/website/docs/usage/101/_pos-deps.md
@@ -25,7 +25,7 @@ for token in doc:
 
 > - **Text:** The original word text.
 > - **Lemma:** The base form of the word.
-> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/)
+> - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/)
 >   part-of-speech tag.
 > - **Tag:** The detailed part-of-speech tag.
 > - **Dep:** Syntactic dependency, i.e. the relation between tokens.
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 665d334f8..707dd3215 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -284,7 +284,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow        # basic and slow test
 ## Troubleshooting guide {#troubleshooting}
 
 This section collects some of the most common errors you may come across when
-installing, loading and using spaCy, as well as their solutions.
+installing, loading and using spaCy, as well as their solutions. Also see the
+[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226),
+which is updated more frequently and covers more transitory issues.
 
 > #### Help us improve this guide
 >
@@ -311,62 +313,6 @@ language's `Language` class instead, for example
 
 </Accordion>
 
-<Accordion title="No such option: --no-cache-dir" id="no-cache-dir">
-
-```
-no such option: --no-cache-dir
-```
-
-The `download` command uses pip to install the pipeline packages and sets the
-`--no-cache-dir` flag to prevent it from requiring too much memory.
-[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching)
-requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest
-version of pip. To see which version you have installed, run `pip --version`.
-
-</Accordion>
-
-<Accordion title="sre_constants.error: bad character range" id="narrow-unicode">
-
-```
-sre_constants.error: bad character range
-```
-
-In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions
-for tokenization to make it up to 2-3 times faster. But this also means that
-it's very important now that you run spaCy with a wide unicode build of Python.
-This means that the build has 1114111 unicode characters available, instead of
-only 65535 in a narrow unicode build. You can check this by running the
-following command:
-
-```bash
-$ python -c "import sys; print(sys.maxunicode)"
-```
-
-If you're running a narrow unicode build, reinstall Python and use a wide
-unicode build instead. You can also rebuild Python and set the
-`--enable-unicode=ucs4` flag.
-
-</Accordion>
-
-<Accordion title="Unknown locale: UTF-8" id="unknown-locale">
-
-```
-ValueError: unknown locale: UTF-8
-```
-
-This error can sometimes occur on OSX and is likely related to a still
-unresolved [Python bug](https://bugs.python.org/issue18378). However, it's easy
-to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then
-run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both
-lines** for `LC_ALL` and `LANG`.
-
-```bash
-$ export LC_ALL=en_US.UTF-8
-$ export LANG=en_US.UTF-8
-```
-
-</Accordion>
-
 <Accordion title="Import error: No module named spacy" id="import-error">
 
 ```
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 28fe058eb..7438a8932 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1363,20 +1363,19 @@
             "url": "https://explosion.ai/demos/sense2vec",
             "code_example": [
                 "import spacy",
-                "from sense2vec import Sense2VecComponent",
                 "",
-                "nlp = spacy.load('en')",
-                "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
-                "nlp.add_pipe(s2v)",
+                "nlp = spacy.load(\"en_core_web_sm\")",
+                "s2v = nlp.add_pipe(\"sense2vec\")",
+                "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")",
                 "",
                 "doc = nlp(\"A sentence about natural language processing.\")",
-                "assert doc[3].text == 'natural language processing'",
-                "freq = doc[3]._.s2v_freq",
-                "vector = doc[3]._.s2v_vec",
-                "most_similar = doc[3]._.s2v_most_similar(3)",
-                "# [(('natural language processing', 'NOUN'), 1.0),",
-                "#  (('machine learning', 'NOUN'), 0.8986966609954834),",
-                "#  (('computer vision', 'NOUN'), 0.8636297583580017)]"
+                "assert doc[3:6].text == \"natural language processing\"",
+                "freq = doc[3:6]._.s2v_freq",
+                "vector = doc[3:6]._.s2v_vec",
+                "most_similar = doc[3:6]._.s2v_most_similar(3)",
+                "# [(('machine learning', 'NOUN'), 0.8986967),",
+                "#  (('computer vision', 'NOUN'), 0.8636297),",
+                "#  (('deep learning', 'NOUN'), 0.8573361)]"
             ],
             "category": ["pipeline", "standalone", "visualizers"],
             "tags": ["vectors"],
@@ -2970,11 +2969,10 @@
             "github": "thomasthiebaud/spacy-fastlang",
             "pip": "spacy_fastlang",
             "code_example": [
-                "import spacy",
-                "from spacy_fastlang import LanguageDetector",
+                "import spacy_fastlang",
                 "",
-                "nlp = spacy.load('en_core_web_sm')",
-                "nlp.add_pipe(LanguageDetector())",
+                "nlp = spacy.load(\"en_core_web_sm\")",
+                "nlp.add_pipe(\"language_detector\")",
                 "doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')",
                 "",
                 "assert doc._.language == 'en'",
@@ -3476,7 +3474,51 @@
                 "github": "bbieniek"
             },
             "category": ["apis"]
-        }
+        },
+        {
+            "id": "phruzz_matcher",
+            "title": "phruzz-matcher",
+            "slogan": "Phrase matcher using RapidFuzz",
+            "description": "Combination of the RapidFuzz library with spaCy's PhraseMatcher. The goal of this component is to find matches between a spaCy doc and a list of phrases when there are NO \"perfect matches\" due to typos or abbreviations.",
+            "github": "mjvallone/phruzz-matcher",
+            "pip": "phruzz_matcher",
+            "code_example": [
+                "import spacy",
+                "from spacy.language import Language",
+                "from phruzz_matcher.phrase_matcher import PhruzzMatcher",
+                "",
+                "famous_people = [",
+                "        \"Brad Pitt\",",
+                "        \"Demi Moore\",",
+                "        \"Bruce Willis\",",
+                "        \"Jim Carrey\",",
+                "]",
+                "",
+                "@Language.factory(\"phrase_matcher\")",
+                "def phrase_matcher(nlp: Language, name: str):",
+                "    return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)",
+                "",
+                "nlp = spacy.blank('es')",
+                "nlp.add_pipe(\"phrase_matcher\")",
+                "",
+                "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")",
+                "print(f\"doc.ents: {doc.ents}\")",
+                "",
+                "#OUTPUT",
+                "#doc.ents: (brad pit, Demi Moore)"
+            ],
+            "thumb": "https://avatars.githubusercontent.com/u/961296?v=4",
+            "image": "",
+            "code_language": "python",
+            "author": "Martin Vallone",
+            "author_links": {
+                "github": "mjvallone",
+                "twitter": "vallotin",
+                "website": "https://fiqus.coop/"
+            },
+            "category": ["pipeline", "research", "standalone"],
+            "tags": ["spacy", "python", "nlp", "ner"]
+        }        
     ],
 
     "categories": [
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 21ade5e36..554823ebf 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -34,6 +34,7 @@ const MODEL_META = {
     core_sm: 'Vocabulary, syntax, entities',
     dep: 'Vocabulary, syntax',
     ent: 'Named entities',
+    sent: 'Sentence boundaries',
     pytt: 'PyTorch Transformers',
     trf: 'Transformers',
     vectors: 'Word vectors',
@@ -195,6 +196,7 @@ const Model = ({
     const [isError, setIsError] = useState(true)
     const [meta, setMeta] = useState({})
     const { type, genre, size } = getModelComponents(name)
+    const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type
     const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
         name,
         compatibility,
@@ -231,7 +233,7 @@ const Model = ({
 
     const rows = [
         { label: 'Language', tag: langId, content: langName },
-        { label: 'Type', tag: type, content: MODEL_META[type] },
+        { label: 'Type', tag: type, content: MODEL_META[display_type] },
         { label: 'Genre', tag: genre, content: MODEL_META[genre] },
         { label: 'Size', tag: size, content: meta.sizeFull },
         { label: 'Components', content: components, help: MODEL_META.components },