Merge pull request #9612 from adrianeboyd/chore/switch-to-master-v3.2.0

Switch v3.2.0 to master
commit 2bf52c44b1
Author: Adriane Boyd
Date: 2021-11-03 16:27:34 +01:00 (committed by GitHub)
148 changed files with 3381 additions and 1690 deletions


@@ -65,8 +65,11 @@ steps:
condition: eq(${{ parameters.gpu }}, true)
- script: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
#python -m spacy download ca_core_news_sm
#python -m spacy download ca_core_news_md
# temporarily install the v3.1.0 models
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl
pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
@@ -95,7 +98,8 @@ steps:
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
# temporarily ignore W095
PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')
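For context: the PYTHONWARNINGS value above uses Python's -W filter mini-language (action:message:category), with later entries taking precedence. A rough stdlib equivalent, assuming spaCy emits W095 as a UserWarning:

import re
import warnings

# roughly PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning":
# escalate all warnings to errors, except W095 and DeprecationWarning
warnings.filterwarnings("error")
# in the -W mini-language the message field is a literal prefix match,
# so the regex-based API needs re.escape
warnings.filterwarnings("ignore", message=re.escape("[W095]"), category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)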

.github/contributors/avi197.md (vendored, new file, +106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Son Pham |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 09/10/2021 |
| GitHub username | Avi197 |
| Website (optional) | |

.github/contributors/fgaim.md (vendored, new file, +106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Fitsum Gaim |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021-08-07 |
| GitHub username | fgaim |
| Website (optional) | |

.github/contributors/syrull.md (vendored, new file, +106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Dimitar Ganev |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021/8/2 |
| GitHub username | syrull |
| Website (optional) | |

.gitignore (vendored, 1 line changed)

@@ -9,6 +9,7 @@ keys/
spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
spacy/tests/universe/universe.json
# Website
website/.cache/


@@ -1,5 +1,6 @@
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.12,<8.1.0
@@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0


@@ -42,6 +42,7 @@ setup_requires =
install_requires =
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
@@ -62,6 +63,7 @@ install_requires =
setuptools
packaging>=20.0
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]
console_scripts =
@@ -69,9 +71,9 @@ console_scripts =
[options.extras_require]
lookups =
spacy_lookups_data>=1.0.2,<1.1.0
spacy_lookups_data>=1.0.3,<1.1.0
transformers =
spacy_transformers>=1.0.1,<1.2.0
spacy_transformers>=1.1.2,<1.2.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =


@@ -81,6 +81,7 @@ COPY_FILES = {
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
}


@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.1.4"
__version__ = "3.2.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"


@@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if strings_map is not None and isinstance(value, basestring):
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
else:
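The hunk above drops Python 2's basestring in favor of str. A minimal sketch of what intify_attrs does with a strings map (the exact integer IDs and hashes are internal details):

from spacy.attrs import intify_attrs
from spacy.strings import StringStore

strings = StringStore()
attrs = intify_attrs({"ORTH": "spaCy", "LEMMA": "spacy"}, strings_map=strings)
# keys are now integer attribute IDs; the string values were interned in the
# StringStore and replaced by their hashes via strings.add()
print(attrs)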


@@ -20,6 +20,7 @@ def init_vectors_cli(
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
@@ -34,7 +35,14 @@ def init_vectors_cli(
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
update_lexemes(nlp, jsonl_loc)
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
convert_vectors(
nlp,
vectors_loc,
truncate=truncate,
prune=prune,
name=name,
mode=mode,
)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)
msg.good(
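With the new flag, floret vectors can be imported via the CLI, e.g. python -m spacy init vectors en vectors.floret.gz ./output --mode floret (the vectors path is a placeholder). A minimal Python sketch of the same conversion, assuming convert_vectors lives in spacy.training.initialize as in v3.x:

from pathlib import Path

import spacy
from spacy.training.initialize import convert_vectors

nlp = spacy.blank("en")
# "vectors.floret.gz" is a hypothetical file exported by floret
convert_vectors(nlp, Path("vectors.floret.gz"), truncate=0, prune=-1, mode="floret")
print(len(nlp.vocab.vectors))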


@@ -5,6 +5,7 @@ raw_text = null
max_epochs = 1000
dropout = 0.2
n_save_every = null
n_save_epoch = null
component = "tok2vec"
layer = ""
corpus = "corpora.pretrain"


@@ -22,6 +22,9 @@ def setup_default_warnings():
# warn once about lemmatizer without required POS
filter_warning("once", error_msg=Warnings.W108)
# floret vector table cannot be modified
filter_warning("once", error_msg="[W114]")
def filter_warning(action: str, error_msg: str):
"""Customize how spaCy should handle a certain warning.
@@ -186,6 +189,8 @@ class Warnings(metaclass=ErrorsWithCodes):
"vectors are not identical to current pipeline vectors.")
W114 = ("Using multiprocessing with GPU models is not recommended and may "
"lead to errors.")
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
"Vectors are calculated from character ngrams.")
class Errors(metaclass=ErrorsWithCodes):
@@ -277,7 +282,7 @@ class Errors(metaclass=ErrorsWithCodes):
"you forget to call the `set_extension` method?")
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} from spacy.lang: {err}")
E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
"package or a valid path to a data directory.")
E052 = ("Can't find model directory: {path}")
@@ -511,13 +516,24 @@ class Errors(metaclass=ErrorsWithCodes):
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
E866 = ("A SpanGroup is not functional after the corresponding Doc has "
E858 = ("The {mode} vector table does not support this operation. "
"{alternative}")
E859 = ("The floret vector table cannot be modified.")
E860 = ("Can't truncate fasttext-bloom vectors.")
E861 = ("No 'keys' should be provided when initializing floret vectors "
"with 'minn' and 'maxn'.")
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
E863 = ("'maxn' must be greater than or equal to 'minn'.")
E864 = ("The complete vector table 'data' is required to initialize floret "
"vectors.")
E865 = ("A SpanGroup is not functional after the corresponding Doc has "
"been garbage collected. To keep using the spans, make sure that "
"the corresponding Doc object is still available in the scope of "
"your function.")
E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "
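The filter_warning helper defined above wraps the warnings filter for spaCy's coded messages. A minimal sketch of calling it directly (W114 is the multiprocessing/GPU warning in this file):

from spacy.errors import Warnings, filter_warning

# escalate W114 to an error for the current process
filter_warning("error", error_msg=Warnings.W114)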


@@ -124,7 +124,7 @@ cdef class KnowledgeBase:
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
def add_entity(self, str entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end.
@@ -185,15 +185,15 @@ cdef class KnowledgeBase:
i += 1
def contains_entity(self, unicode entity):
def contains_entity(self, str entity):
cdef hash_t entity_hash = self.vocab.strings.add(entity)
return entity_hash in self._entry_index
def contains_alias(self, unicode alias):
def contains_alias(self, str alias):
cdef hash_t alias_hash = self.vocab.strings.add(alias)
return alias_hash in self._alias_index
def add_alias(self, unicode alias, entities, probabilities):
def add_alias(self, str alias, entities, probabilities):
"""
For a given alias, add its potential entities and prior probabilities to the KB.
Return the alias_hash at the end
@@ -239,7 +239,7 @@ cdef class KnowledgeBase:
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
"""
For an alias already existing in the KB, extend its potential entities with one more.
Throw a warning if either the alias or the entity is unknown,
@@ -286,7 +286,7 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@@ -307,7 +307,7 @@ cdef class KnowledgeBase:
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0]
def get_vector(self, unicode entity):
def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity]
# Return an empty list if this entity is unknown in this KB
@@ -317,7 +317,7 @@ cdef class KnowledgeBase:
return self._vectors_table[self._entries[entry_index].vector_index]
def get_prior_prob(self, unicode entity, unicode alias):
def get_prior_prob(self, str entity, str alias):
""" Return the prior probability of a given alias being linked to a given entity,
or return 0.0 when this combination is not known in the knowledge base"""
cdef hash_t alias_hash = self.vocab.strings[alias]
@@ -587,7 +587,7 @@ cdef class Writer:
def __init__(self, path):
assert isinstance(path, Path)
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
raise IOError(Errors.E146.format(path=path))
@@ -629,7 +629,7 @@ cdef class Writer:
cdef class Reader:
def __init__(self, path):
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)
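These signatures only swap Python 2's unicode for str; the KnowledgeBase API itself is unchanged. A minimal sketch of the methods touched by this hunk:

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 0.0, 0.5])
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
print(kb.contains_alias("Douglas Adams"))  # True
print([c.entity_ for c in kb.get_alias_candidates("Douglas Adams")])  # ['Q42']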


@@ -1,7 +1,7 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
_suffixes = (
_list_punct


@@ -1,265 +1,79 @@
# Source: https://github.com/Alir3z4/stop-words
"""
References:
https://github.com/Alir3z4/stop-words - Original list, serves as a base.
https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
"""
STOP_WORDS = set(
"""
а
автентичен
аз
ако
ала
бе
без
беше
би
бивш
бивша
бившо
бил
била
били
било
благодаря
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
внимава
време
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главен
главна
главно
глас
го
година
години
годишен
д
да
дали
два
двама
двамата
две
двете
ден
днес
дни
до
добра
добре
добро
добър
докато
докога
дори
досега
доста
друг
друга
други
е
евтин
едва
един
една
еднаква
еднакви
еднакъв
едно
екип
ето
живот
за
забавям
зад
заедно
заради
засега
заспал
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
лесен
лесно
ли
лош
м
май
малко
ме
между
мек
мен
месец
ми
много
мнозина
мога
могат
може
мокър
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нещо
нея
ни
ние
никой
нито
нищо
но
нов
нова
нови
новина
някои
някой
няколко
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първата
първи
първо
пъти
равен
равна
с
са
сам
само
се
сега
си
син
скоро
след
следващ
сме
смях
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
т.н.
то
това
тогава
този
той
толкова
точно
три
трябва
тук
тъй
тя
тях
у
утре
харесва
хиляди
ч
часа
че
често
чрез
ще
щом
а автентичен аз ако ала
бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
бъде бъда бяха
в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все
всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху
вътре веднъж
г ги главен главна главно глас го годно година години годишен
д да дали далеч далече два двама двамата две двете ден днес дни до добра добре
добро добър достатъчно докато докога дори досега доста друг друга другаде други
е евтин едва един една еднаква еднакви еднакъв едно екип ето
живот жив
за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал
затова запазва започвам защо защото завинаги
и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки
извън имайки
й йо
каза казва казвайки казвам как каква какво както какъв като кога кауза каузи
когато когото което които кой който колко която къде където към край кратък
кръгъл
лесен лесно ли летя летиш летим лош
м май малко макар малцина междувременно минус ме между мек мен месец ми мис
мисля много мнозина мога могат може мой можем мокър моля момента му
н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина
например наопаки наполовина напоследък нека независимо нас насам наскоро
настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова
нови новина някои някой някога някъде няколко няма
о обаче около описан опитах опитва опитвайки опитвам определен определено освен
обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу
отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам
относно още
п пак по повече повечето под поне просто пряко поради после последен последно
посочен почти прави прав прави правя пред преди през при пък първата първи първо
път пъти плюс
равен равна различен различни разумен разумно
с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след
следващ следващия следва следното следователно случва сме смях собствен
сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам
съответен съответно сте съм със също
т така техен техни такива такъв твърде там трета твой те тези ти то това
тогава този той търси толкова точно три трябва тук тъй тя тях
у утре ужасно употреба успоредно уточнен уточняване
харесва харесали хиляди
ч часа ценя цяло цялостен че често чрез чудя
ще щеше щом щяха
юмрук
я
як
я як
""".split()
)


@@ -1,10 +1,16 @@
"""
References:
https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations
(countries, occupations, fields of studies and more).
"""
from ...symbols import ORTH, NORM
_exc = {}
_abbr_exc = [
# measurements
for abbr in [
{ORTH: "м", NORM: "метър"},
{ORTH: "мм", NORM: "милиметър"},
{ORTH: "см", NORM: "сантиметър"},
@@ -17,51 +23,191 @@ _abbr_exc = [
{ORTH: "хл", NORM: "хектолиър"},
{ORTH: "дкл", NORM: "декалитър"},
{ORTH: "л", NORM: "литър"},
]
for abbr in _abbr_exc:
]:
_exc[abbr[ORTH]] = [abbr]
_abbr_line_exc = [
# line abbreviations
for abbr in [
{ORTH: "г-жа", NORM: "госпожа"},
{ORTH: "г", NORM: "господин"},
{ORTH: "г-ца", NORM: "госпожица"},
{ORTH: "д-р", NORM: "доктор"},
{ORTH: "о", NORM: "остров"},
{ORTH: "п-в", NORM: "полуостров"},
]
for abbr in _abbr_line_exc:
{ORTH: "с-у", NORM: "срещу"},
{ORTH: "в-у", NORM: "върху"},
{ORTH: "м-у", NORM: "между"},
]:
_exc[abbr[ORTH]] = [abbr]
_abbr_dot_exc = [
# foreign language related abbreviations
for abbr in [
{ORTH: "англ.", NORM: "английски"},
{ORTH: "ан.", NORM: "английски термин"},
{ORTH: "араб.", NORM: "арабски"},
{ORTH: "афр.", NORM: "африкански"},
{ORTH: "гр.", NORM: "гръцки"},
{ORTH: "лат.", NORM: "латински"},
{ORTH: "рим.", NORM: "римски"},
{ORTH: "старогр.", NORM: "старогръцки"},
{ORTH: "староевр.", NORM: "староеврейски"},
{ORTH: "фр.", NORM: "френски"},
{ORTH: "хол.", NORM: "холандски"},
{ORTH: "швед.", NORM: "шведски"},
{ORTH: "шотл.", NORM: "шотландски"},
{ORTH: "яп.", NORM: "японски"},
]:
_exc[abbr[ORTH]] = [abbr]
# profession and academic titles abbreviations
for abbr in [
{ORTH: "акад.", NORM: "академик"},
{ORTH: "ал.", NORM: "алинея"},
{ORTH: "арх.", NORM: "архитект"},
{ORTH: "инж.", NORM: "инженер"},
{ORTH: "канц.", NORM: "канцлер"},
{ORTH: "проф.", NORM: "професор"},
{ORTH: "св.", NORM: "свети"},
]:
_exc[abbr[ORTH]] = [abbr]
# fields of studies
for abbr in [
{ORTH: "агр.", NORM: "агрономия"},
{ORTH: "ав.", NORM: "авиация"},
{ORTH: "агр.", NORM: "агрономия"},
{ORTH: "археол.", NORM: "археология"},
{ORTH: "астр.", NORM: "астрономия"},
{ORTH: "геод.", NORM: "геодезия"},
{ORTH: "геол.", NORM: "геология"},
{ORTH: "геом.", NORM: "геометрия"},
{ORTH: "гимн.", NORM: "гимнастика"},
{ORTH: "грам.", NORM: "граматика"},
{ORTH: "жур.", NORM: "журналистика"},
{ORTH: "журн.", NORM: "журналистика"},
{ORTH: "зем.", NORM: "земеделие"},
{ORTH: "икон.", NORM: "икономика"},
{ORTH: "лит.", NORM: "литература"},
{ORTH: "мат.", NORM: "математика"},
{ORTH: "мед.", NORM: "медицина"},
{ORTH: "муз.", NORM: "музика"},
{ORTH: "печ.", NORM: "печатарство"},
{ORTH: "пол.", NORM: "политика"},
{ORTH: "псих.", NORM: "психология"},
{ORTH: "соц.", NORM: "социология"},
{ORTH: "стат.", NORM: "статистика"},
{ORTH: "стил.", NORM: "стилистика"},
{ORTH: "топогр.", NORM: "топография"},
{ORTH: "търг.", NORM: "търговия"},
{ORTH: "фарм.", NORM: "фармацевтика"},
{ORTH: "фехт.", NORM: "фехтовка"},
{ORTH: "физиол.", NORM: "физиология"},
{ORTH: "физ.", NORM: "физика"},
{ORTH: "фил.", NORM: "философия"},
{ORTH: "фин.", NORM: "финанси"},
{ORTH: "фолкл.", NORM: "фолклор"},
{ORTH: "фон.", NORM: "фонетика"},
{ORTH: "фот.", NORM: "фотография"},
{ORTH: "футб.", NORM: "футбол"},
{ORTH: "хим.", NORM: "химия"},
{ORTH: "хир.", NORM: "хирургия"},
{ORTH: "ел.", NORM: "електротехника"},
]:
_exc[abbr[ORTH]] = [abbr]
for abbr in [
{ORTH: "ал.", NORM: "алинея"},
{ORTH: "авт.", NORM: "автоматично"},
{ORTH: "адм.", NORM: "администрация"},
{ORTH: "арт.", NORM: "артилерия"},
{ORTH: "бл.", NORM: "блок"},
{ORTH: "бр.", NORM: "брой"},
{ORTH: "бул.", NORM: "булевард"},
{ORTH: "букв.", NORM: "буквално"},
{ORTH: "в.", NORM: "век"},
{ORTH: "вр.", NORM: "време"},
{ORTH: "вм.", NORM: "вместо"},
{ORTH: "воен.", NORM: "военен термин"},
{ORTH: "г.", NORM: "година"},
{ORTH: "гр.", NORM: "град"},
{ORTH: "гл.", NORM: "глагол"},
{ORTH: "др.", NORM: "други"},
{ORTH: "ез.", NORM: "езеро"},
{ORTH: "ж.р.", NORM: "женски род"},
{ORTH: "инж.", NORM: "инженер"},
{ORTH: "жп.", NORM: "железопът"},
{ORTH: "застр.", NORM: "застрахователно дело"},
{ORTH: "знач.", NORM: "значение"},
{ORTH: "и др.", NORM: "и други"},
{ORTH: "и под.", NORM: "и подобни"},
{ORTH: "и пр.", NORM: "и прочие"},
{ORTH: "изр.", NORM: "изречение"},
{ORTH: "изт.", NORM: "източен"},
{ORTH: "конкр.", NORM: "конкретно"},
{ORTH: "лв.", NORM: "лев"},
{ORTH: "л.", NORM: "лице"},
{ORTH: "м.р.", NORM: "мъжки род"},
{ORTH: "мат.", NORM: "математика"},
{ORTH: "мед.", NORM: "медицина"},
{ORTH: "мин.вр.", NORM: "минало време"},
{ORTH: "мн.ч.", NORM: "множествено число"},
{ORTH: "напр.", NORM: "например"},
{ORTH: "нар.", NORM: "наречие"},
{ORTH: "науч.", NORM: "научен термин"},
{ORTH: "непр.", NORM: "неправилно"},
{ORTH: "обик.", NORM: "обикновено"},
{ORTH: "опред.", NORM: "определение"},
{ORTH: "особ.", NORM: "особено"},
{ORTH: "ост.", NORM: "остаряло"},
{ORTH: "относ.", NORM: "относително"},
{ORTH: "отр.", NORM: "отрицателно"},
{ORTH: "пл.", NORM: "площад"},
{ORTH: "проф.", NORM: "професор"},
{ORTH: "пад.", NORM: "падеж"},
{ORTH: "парл.", NORM: "парламентарен"},
{ORTH: "погов.", NORM: "поговорка"},
{ORTH: "пон.", NORM: "понякога"},
{ORTH: "правосл.", NORM: "православен"},
{ORTH: "прибл.", NORM: "приблизително"},
{ORTH: "прил.", NORM: "прилагателно име"},
{ORTH: "пр.", NORM: "прочие"},
{ORTH: "с.", NORM: "село"},
{ORTH: "с.р.", NORM: "среден род"},
{ORTH: "св.", NORM: "свети"},
{ORTH: "сп.", NORM: "списание"},
{ORTH: "стр.", NORM: "страница"},
{ORTH: "сз.", NORM: "съюз"},
{ORTH: "сег.", NORM: "сегашно"},
{ORTH: "сп.", NORM: "спорт"},
{ORTH: "срв.", NORM: "сравни"},
{ORTH: "с.ст.", NORM: "селскостопанска техника"},
{ORTH: "счет.", NORM: "счетоводство"},
{ORTH: "съкр.", NORM: "съкратено"},
{ORTH: "съобщ.", NORM: "съобщение"},
{ORTH: "същ.", NORM: "съществително"},
{ORTH: "текст.", NORM: "текстилен"},
{ORTH: "телев.", NORM: "телевизия"},
{ORTH: "тел.", NORM: "телефон"},
{ORTH: "т.е.", NORM: "тоест"},
{ORTH: "т.н.", NORM: "така нататък"},
{ORTH: "т.нар.", NORM: "така наречен"},
{ORTH: "търж.", NORM: "тържествено"},
{ORTH: "ул.", NORM: "улица"},
{ORTH: "уч.", NORM: "училище"},
{ORTH: "унив.", NORM: "университет"},
{ORTH: "харт.", NORM: "хартия"},
{ORTH: "хидр.", NORM: "хидравлика"},
{ORTH: "хран.", NORM: "хранителна"},
{ORTH: "църк.", NORM: "църковен термин"},
{ORTH: "числ.", NORM: "числително"},
{ORTH: "чл.", NORM: "член"},
]
for abbr in _abbr_dot_exc:
{ORTH: "ч.", NORM: "число"},
{ORTH: "числ.", NORM: "числително"},
{ORTH: "шахм.", NORM: "шахмат"},
{ORTH: "шах.", NORM: "шахмат"},
{ORTH: "юр.", NORM: "юридически"},
]:
_exc[abbr[ORTH]] = [abbr]
# slash abbreviations
for abbr in [
{ORTH: "м/у", NORM: "между"},
{ORTH: "с/у", NORM: "срещу"},
]:
_exc[abbr[ORTH]] = [abbr]
TOKENIZER_EXCEPTIONS = _exc
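A quick sketch of the effect of these exceptions: matching abbreviations stay single tokens and carry the expanded form as their NORM (for duplicated entries such as "гр.", the last assignment wins):

import spacy

nlp = spacy.blank("bg")
doc = nlp("Срещнахме се на бул. Витоша в гр. София.")
print([t.text for t in doc])   # "бул." and "гр." remain single tokens
print([t.norm_ for t in doc])  # NORM expands them to "булевард" and "град"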


@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
@Bengali.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Bengali"]
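The same scorer-in-config pattern is applied to each language-specific lemmatizer factory below. A minimal sketch of pinning the default scorer when adding the pipe, using the registered name from the config above:

import spacy

nlp = spacy.blank("bn")
nlp.add_pipe(
    "lemmatizer",
    config={"mode": "rule", "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}},
)
print(nlp.pipe_names)  # ['lemmatizer']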

spacy/lang/ca/__init__.py (mode changed: normal → executable, 23 lines changed)

@@ -1,9 +1,9 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
@@ -15,6 +15,7 @@ class CatalanDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
prefixes = TOKENIZER_PREFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
@@ -28,13 +29,25 @@ class Catalan(Language):
@Catalan.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return CatalanLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Catalan"]

spacy/lang/ca/punctuation.py (mode changed: normal → executable, 11 lines changed)

@@ -1,4 +1,5 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import LIST_CURRENCY
from ..char_classes import CURRENCY
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
_prefixes = (
["§", "%", "=", "", "", "-", r"\+(?![0-9])"]
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_infixes = (
LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
]
)
@@ -44,3 +54,4 @@ _suffixes = (
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_PREFIXES = _prefixes

spacy/lang/ca/tokenizer_exceptions.py (mode changed: normal → executable, 21 lines changed)

@@ -18,12 +18,21 @@ for exc_data in [
{ORTH: "nov.", NORM: "novembre"},
{ORTH: "dec.", NORM: "desembre"},
{ORTH: "Dr.", NORM: "doctor"},
{ORTH: "Dra.", NORM: "doctora"},
{ORTH: "Sr.", NORM: "senyor"},
{ORTH: "Sra.", NORM: "senyora"},
{ORTH: "Srta.", NORM: "senyoreta"},
{ORTH: "núm", NORM: "número"},
{ORTH: "St.", NORM: "sant"},
{ORTH: "Sta.", NORM: "santa"},
{ORTH: "pl.", NORM: "plaça"},
{ORTH: "à."},
{ORTH: "è."},
{ORTH: "é."},
{ORTH: "í."},
{ORTH: "ò."},
{ORTH: "ó."},
{ORTH: "ú."},
{ORTH: "'l"},
{ORTH: "'ls"},
{ORTH: "'m"},
@@ -34,6 +43,18 @@ for exc_data in [
]:
_exc[exc_data[ORTH]] = [exc_data]
_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
# Times
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]


@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -28,13 +28,25 @@ class Greek(Language):
@Greek.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return GreekLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Greek"]


@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -26,13 +26,25 @@ class English(Language):
@English.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return EnglishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["English"]


@@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""


@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
@@ -26,13 +26,25 @@ class Spanish(Language):
@Spanish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return SpanishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Spanish"]


@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
@Persian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Persian"]


@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@@ -31,13 +31,25 @@ class French(Language):
@French.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return FrenchLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["French"]


@@ -1,6 +1,11 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
from .lemmatizer import IrishLemmatizer
class IrishDefaults(BaseDefaults):
@@ -13,4 +18,16 @@ class Irish(Language):
Defaults = IrishDefaults
@Irish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Irish"]


@@ -1,35 +0,0 @@
# fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
slender_vowels = ["e", "é", "i", "í"]
vowels = broad_vowels + slender_vowels
# fmt: on
def ends_dentals(word):
if word != "" and word[-1] in ["d", "n", "t", "s"]:
return True
else:
return False
def devoice(word):
if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
return word[:-1] + "t"
else:
return word
def ends_with_vowel(word):
return word != "" and word[-1] in vowels
def starts_with_vowel(word):
return word != "" and word[0] in vowels
def deduplicate(word):
if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
return word[:-1]
else:
return word

spacy/lang/ga/lemmatizer.py (new file, +162 lines)

@@ -0,0 +1,162 @@
from typing import List, Dict, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class IrishLemmatizer(Lemmatizer):
# This is a lookup-based lemmatiser using data extracted from
# BuNaMo (https://github.com/michmech/BuNaMo)
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "pos_lookup":
# fmt: off
required = [
"lemma_lookup_adj", "lemma_lookup_adp",
"lemma_lookup_noun", "lemma_lookup_verb"
]
# fmt: on
return (required, [])
else:
return super().get_lookups_config(mode)
def pos_lookup_lemmatize(self, token: Token) -> List[str]:
univ_pos = token.pos_
string = unponc(token.text)
if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
return [string.lower()]
demutated = demutate(string)
secondary = ""
if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
secondary = string[1:]
lookup_pos = univ_pos.lower()
if univ_pos == "PROPN":
lookup_pos = "noun"
if token.has_morph():
# TODO: lookup is actually required for the genitive forms, but
# this is not in BuNaMo, and would not be of use with IDT.
if univ_pos == "NOUN" and (
"VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
):
hpref = "Form=HPref" in token.morph
return [demutate(string, hpref).lower()]
elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
return [demutate(string).lower()]
lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
def to_list(value):
if value is None:
value = []
elif not isinstance(value, list):
value = [value]
return value
if univ_pos == "ADP":
return to_list(lookup_table.get(string, string.lower()))
ret = []
if univ_pos == "PROPN":
ret.extend(to_list(lookup_table.get(demutated)))
ret.extend(to_list(lookup_table.get(secondary)))
else:
ret.extend(to_list(lookup_table.get(demutated.lower())))
ret.extend(to_list(lookup_table.get(secondary.lower())))
if len(ret) == 0:
ret = [string.lower()]
return ret
def demutate(word: str, is_hpref: bool = False) -> str:
UVOWELS = "AÁEÉIÍOÓUÚ"
LVOWELS = "aáeéiíoóuú"
lc = word.lower()
# remove eclipsis
if lc.startswith("bhf"):
word = word[2:]
elif lc.startswith("mb"):
word = word[1:]
elif lc.startswith("gc"):
word = word[1:]
elif lc.startswith("nd"):
word = word[1:]
elif lc.startswith("ng"):
word = word[1:]
elif lc.startswith("bp"):
word = word[1:]
elif lc.startswith("dt"):
word = word[1:]
elif word[0:1] == "n" and word[1:2] in UVOWELS:
word = word[1:]
elif lc.startswith("n-") and word[2:3] in LVOWELS:
word = word[2:]
# non-standard eclipsis
elif lc.startswith("bh-f"):
word = word[3:]
elif lc.startswith("m-b"):
word = word[2:]
elif lc.startswith("g-c"):
word = word[2:]
elif lc.startswith("n-d"):
word = word[2:]
elif lc.startswith("n-g"):
word = word[2:]
elif lc.startswith("b-p"):
word = word[2:]
elif lc.startswith("d-t"):
word = word[2:]
# t-prothesis
elif lc.startswith("ts"):
word = word[1:]
elif lc.startswith("t-s"):
word = word[2:]
# h-prothesis, if known to be present
elif is_hpref and word[0:1] == "h":
word = word[1:]
# h-prothesis, simple case
# words can also begin with 'h', but unlike eclipsis,
# a hyphen is not used, so that needs to be handled
# elsewhere
elif word[0:1] == "h" and word[1:2] in UVOWELS:
word = word[1:]
# lenition
# this breaks the previous if, to handle super-non-standard
# text where both eclipsis and lenition were used.
lc = word.lower()  # recompute on the eclipsis-stripped form
if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
    word = word[0:1] + word[2:]
return word
def unponc(word: str) -> str:
# fmt: off
PONC = {
    "ḃ": "bh",
    "ċ": "ch",
    "ḋ": "dh",
    "ḟ": "fh",
    "ġ": "gh",
    "ṁ": "mh",
    "ṗ": "ph",
    "ṡ": "sh",
    "ṫ": "th",
    "Ḃ": "BH",
    "Ċ": "CH",
    "Ḋ": "DH",
    "Ḟ": "FH",
    "Ġ": "GH",
    "Ṁ": "MH",
    "Ṗ": "PH",
    "Ṡ": "SH",
    "Ṫ": "TH"
}
# fmt: on
buf = []
for ch in word:
if ch in PONC:
buf.append(PONC[ch])
else:
buf.append(ch)
return "".join(buf)


@@ -9,6 +9,8 @@ _exc = {
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
"théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
"tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
}
for exc_data in [


@@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
)
for u in "cfkCFK":
_exc[f"°{u}"] = [{ORTH: f"°{u}"}]
_exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match


@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
@@ -23,13 +23,25 @@ class Italian(Language):
@Italian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return ItalianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Italian"]


@@ -1,21 +1,25 @@
from typing import Optional, Union, Dict, Any
from typing import Optional, Union, Dict, Any, Callable
from pathlib import Path
import srsly
from collections import namedtuple
from thinc.api import Model
import re
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...compat import copy_reg
from ...errors import Errors
from ...language import Language, BaseDefaults
from ...pipeline import Morphologizer
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer
from ...symbols import POS
from ...tokens import Doc
from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from ... import util
@@ -31,16 +35,21 @@ split_mode = null
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
def create_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode)
return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
return japanese_tokenizer_factory
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab
def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
self.vocab = vocab
self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode)
# if we're using split mode A we don't need subtokens
self.need_subtokens = not (split_mode is None or split_mode == "A")
def __reduce__(self):
return JapaneseTokenizer, (self.vocab, self.split_mode)
def __call__(self, text: str) -> Doc:
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
@@ -49,8 +58,8 @@ class JapaneseTokenizer(DummyTokenizer):
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6
words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 7
)
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
@@ -68,9 +77,18 @@ class JapaneseTokenizer(DummyTokenizer):
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list
morph = {}
if dtoken.inf:
# it's normal for this to be empty for non-inflecting types
morph["Inflection"] = dtoken.inf
token.norm_ = dtoken.norm
if dtoken.reading:
# punctuation is its own reading, but we don't want values like
# "=" here
morph["Reading"] = re.sub("[=|]", "_", dtoken.reading)
token.morph = MorphAnalysis(self.vocab, morph)
if self.need_subtokens:
doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
@ -81,9 +99,10 @@ class JapaneseTokenizer(DummyTokenizer):
DetailedToken(
token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
token.normalized_form(),
token.reading_form(),
sub_tokens_list[idx]
if sub_tokens_list
else None, # user_data['sub_tokens']
@ -105,9 +124,8 @@ class JapaneseTokenizer(DummyTokenizer):
]
def _get_sub_tokens(self, sudachipy_tokens):
if (
self.split_mode is None or self.split_mode == "A"
): # do nothing for default split mode
# do nothing for default split mode
if not self.need_subtokens:
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
@ -176,9 +194,33 @@ class Japanese(Language):
Defaults = JapaneseDefaults
@Japanese.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={
"model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": True,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
},
default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
overwrite: bool,
extend: bool,
scorer: Optional[Callable],
):
return Morphologizer(
nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer
)
# Hold the attributes we need with convenient names
DetailedToken = namedtuple(
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
"DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
)
@ -254,7 +296,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace()
text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)]
text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
text_spaces = [False]
return text_dtokens, text_spaces
@ -271,7 +313,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# space token
if word_start > 0:
w = text[text_pos : text_pos + word_start]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False)
text_pos += word_start
@ -287,16 +329,10 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# trailing space token
if text_pos < len(text):
w = text[text_pos:]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False)
return text_dtokens, text_spaces
def pickle_japanese(instance):
return Japanese, tuple()
copy_reg.pickle(Japanese, pickle_japanese)
__all__ = ["Japanese"]
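With this change the Japanese tokenizer writes inflection and reading into `token.morph` (as `Inflection` and `Reading` features) and the normalized form into `token.norm_`, instead of stashing them in `doc.user_data`; only sub-tokens remain there, and only for split modes other than "A". A short sketch, assuming `sudachipy` and `sudachidict_core` are installed:

import spacy

# Sketch: inflection and reading now live on token.morph, the normalized
# form on token.norm_; doc.user_data only keeps sub_tokens (modes B/C).
nlp = spacy.blank("ja")
doc = nlp("走った")
for token in doc:
    print(token.text, token.norm_,
          token.morph.get("Inflection"), token.morph.get("Reading"))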

View File

@ -5,11 +5,11 @@ from .tag_map import TAG_MAP
from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...compat import copy_reg
from ...scorer import Scorer
from ...symbols import POS
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
DEFAULT_CONFIG = """
@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp)
return KoreanTokenizer(nlp.vocab)
return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer):
def __init__(self, nlp: Language):
self.vocab = nlp.vocab
def __init__(self, vocab: Vocab):
self.vocab = vocab
MeCab = try_mecab_import() # type: ignore[func-returns-value]
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
def __reduce__(self):
return KoreanTokenizer, (self.vocab,)
def __del__(self):
self.mecab_tokenizer.__del__()
@ -106,10 +109,4 @@ def check_spaces(text, tokens):
yield False
def pickle_korean(instance):
return Korean, tuple()
copy_reg.pickle(Korean, pickle_korean)
__all__ = ["Korean"]
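Taking a `Vocab` instead of the full `Language` object, together with the new `__reduce__`, makes the tokenizer itself picklable, which is why the module-level `copy_reg` workaround above is removed. A sketch, assuming the Korean MeCab dependencies are installed:

import pickle
import spacy

# Sketch: __reduce__ returns (KoreanTokenizer, (vocab,)), so the
# tokenizer round-trips through pickle without the old copy_reg hook.
nlp = spacy.blank("ko")
tokenizer = pickle.loads(pickle.dumps(nlp.tokenizer))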

View File

@ -3,6 +3,7 @@ import unicodedata
import re
from .. import attrs
from .tokenizer_exceptions import URL_MATCH
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
return True
if tld.isalpha() and tld in _tlds:
return True
if URL_MATCH(text):
return True
return False
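`like_url` now falls back to the tokenizer's `URL_MATCH` pattern, so full URLs that the TLD-based heuristics miss are still flagged. A quick sketch (exact results depend on the heuristics and the `_tlds` list):

from spacy.lang.lex_attrs import like_url

# Sketch: URL_MATCH is consulted after the existing heuristics.
for text in ("www.example.com", "http://example.org/path?id=1"):
    print(text, like_url(text))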

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .lemmatizer import MacedonianLemmatizer
from .stop_words import STOP_WORDS
@ -38,13 +38,25 @@ class Macedonian(Language):
@Macedonian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return MacedonianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Macedonian"]

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@ -26,13 +26,25 @@ class Norwegian(Language):
@Norwegian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Norwegian"]

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@ -30,13 +30,25 @@ class Dutch(Language):
@Dutch.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return DutchLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Dutch"]

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@ -33,13 +33,25 @@ class Polish(Language):
@Polish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return PolishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Polish"]

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
@ -22,7 +22,12 @@ class Russian(Language):
@Russian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
@ -31,8 +36,11 @@ def make_lemmatizer(
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return RussianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Russian"]

View File

@ -1,8 +1,9 @@
from typing import Optional, List, Dict, Tuple
from typing import Optional, List, Dict, Tuple, Callable
from thinc.api import Model
from ...pipeline import Lemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...symbols import POS
from ...tokens import Token
from ...vocab import Vocab
@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
*,
mode: str = "pymorphy2",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
try:
@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
string = token.text

View File

@ -1,47 +1,195 @@
STOP_WORDS = set(
"""
අතර
එචචර
එපමණ
එල
එව
කට
කද
නම
පමණ
පමණ
චර
පමණ
ලද
සහ
සමග
සමඟ
අහ
ආහ
ඕහ
අන
අඳ
අප
අප
අය
ආය
ඌය
බඳ
වන
අය
අය
වග
බවට
බව
බව
නම
මහ
මහ
පමණ
පමණ
පමන
වන
තර
වත
වද
සමඟ
ඇත
වශය
යන
සඳහ
මග
ඉත
එම
අතර
සමග
බඳව
බඳ
බව
මහ
තට
වට
අන
නව
බඳ
නට
එහ
එහ
තවත
තව
සහ
දක
බවත
බවද
මත
ඇත
ඇත
වඩ
වඩ
තර
ඉක
යල
ඉත
ටන
පටන
දක
වක
පව
වත
ඇය
මන
වත
පත
තව
ඉත
වහ
හන
එම
එමබල
නම
වන
කල
ඉඳ
අන
ඔන
උද
සඳහ
අරබය
එන
එබ
අන
පර
වට
නම
එනම
වස
පර
එහ
""".split()
)

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
@ -29,13 +29,25 @@ class Swedish(Language):
@Swedish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Swedish"]

View File

@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
DEFAULT_CONFIG = """
@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer():
def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp)
return ThaiTokenizer(nlp.vocab)
return thai_tokenizer_factory
class ThaiTokenizer(DummyTokenizer):
def __init__(self, nlp: Language) -> None:
def __init__(self, vocab: Vocab) -> None:
try:
from pythainlp.tokenize import word_tokenize
except ImportError:
@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
"https://github.com/PyThaiNLP/pythainlp"
) from None
self.word_tokenize = word_tokenize
self.vocab = nlp.vocab
self.vocab = vocab
def __call__(self, text: str) -> Doc:
words = list(self.word_tokenize(text))

View File

@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM
_num_words = [
"ዜሮ",
"",
"",
"ክልተ",
"ሰለስተ",
"ኣርባዕተ",
@ -11,66 +11,37 @@ _num_words = [
"ሸውዓተ",
"ሽሞንተ",
"ትሽዓተ",
"ኣሰርተ",
"ኣሰርተ ሐደ",
"ኣሰርተ ክልተ",
"ኣሰርተ ሰለስተ",
"ኣሰርተ ኣርባዕተ",
"ኣሰርተ ሓሙሽተ",
"ኣሰርተ ሽድሽተ",
"ኣሰርተ ሸውዓተ",
"ኣሰርተ ሽሞንተ",
"ኣሰርተ ትሽዓተ",
"ዓሰርተ",
"ዕስራ",
"ሰላሳ",
"ኣርብዓ",
"ምሳ",
"ስል",
"ሓምሳ",
"ሱሳ",
"ሰብዓ",
"ሰማንያ",
"ስዓ",
"ቴስዓ",
"ሚእቲ",
"ሺሕ",
"ሚልዮን",
"ቢልዮን",
"ትሪልዮን",
"ኳድሪልዮን",
"ገጅልዮን",
"ዝልዮን",
"ጋዚልዮን",
"ዚልዮን"
]
# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
_ordinal_words = [
"ቀዳማይ",
"ካልኣይ",
"ሳልሳይ",
"ራብ",
"ራብ",
"ሓምሻይ",
"ሻድሻይ",
"ሻውዓይ",
"ሻምናይ",
"ዘጠነኛ",
"አስረኛ",
"ኣሰርተ አንደኛ",
"ኣሰርተ ሁለተኛ",
"ኣሰርተ ሶስተኛ",
"ኣሰርተ አራተኛ",
"ኣሰርተ አምስተኛ",
"ኣሰርተ ስድስተኛ",
"ኣሰርተ ሰባተኛ",
"ኣሰርተ ስምንተኛ",
"ኣሰርተ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ" "አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
"ሰማንያኛ",
"ዘጠናኛ",
"መቶኛ",
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ",
"ታሽዓይ",
"ዓስራይ"
]
@ -92,7 +63,7 @@ def like_num(text):
# Check ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith("ኛ"):
if text_lower.endswith("ይ"):
if text_lower[:-2].isdigit():
return True

View File

@ -1,7 +1,7 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
_suffixes = (
_list_punct

View File

@ -1,6 +1,27 @@
# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
# Stop words
STOP_WORDS = set(
"""
ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም
'ምበር ' '' ''ውን '' '' 'ዮም 'ዮን
ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ
ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም
ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን
ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ
ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ
ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን
እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ
ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም
እየ እየን እዩ እያ እዮም
ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ
ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል
ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ
ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ ዘሎ ዘይብሉ
ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ
ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
""".split()
)

View File

@ -250,3 +250,9 @@ o.0
for orth in emoticons:
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
# Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds
for u in "cfkCFK":
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@ -23,13 +23,25 @@ class Ukrainian(Language):
@Ukrainian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return UkrainianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Ukrainian"]

View File

@ -1,8 +1,9 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab
@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
*,
mode: str = "pymorphy2",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
try:
@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

View File

@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from ... import util
@ -24,14 +25,14 @@ use_pyvi = true
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
return vietnamese_tokenizer_factory
class VietnameseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, use_pyvi: bool = False):
self.vocab = nlp.vocab
def __init__(self, vocab: Vocab, use_pyvi: bool = False):
self.vocab = vocab
self.use_pyvi = use_pyvi
if self.use_pyvi:
try:
@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
)
raise ImportError(msg) from None
def __reduce__(self):
return VietnameseTokenizer, (self.vocab, self.use_pyvi)
def __call__(self, text: str) -> Doc:
if self.use_pyvi:
words = self.pyvi_tokenize(text)

18
spacy/lang/vi/examples.py Normal file
View File

@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.vi.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Đây là đâu, tôi là ai?",
"Căn phòng có nhiều cửa sổ nên nó khá sáng",
"Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.",
"Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.",
"Ông bạn đang ở đâu thế?",
"Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?",
"Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?",
"Làm việc nhiều chán quá, đi chơi đâu đi?",
]

View File

@ -9,11 +9,14 @@ _num_words = [
"bốn",
"năm",
"sáu",
"bảy",
"bẩy",
"tám",
"chín",
"mười",
"chục",
"trăm",
"nghìn",
"tỷ",
]

View File

@ -11,6 +11,7 @@ from ...scorer import Scorer
from ...tokens import Doc
from ...training import validate_examples, Example
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ... import util
@ -48,14 +49,14 @@ class Segmenter(str, Enum):
@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
def chinese_tokenizer_factory(nlp):
return ChineseTokenizer(nlp, segmenter=segmenter)
return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
return chinese_tokenizer_factory
class ChineseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
self.vocab = nlp.vocab
def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
self.vocab = vocab
self.segmenter = (
segmenter.value if isinstance(segmenter, Segmenter) else segmenter
)

View File

@ -115,7 +115,7 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (str): Two-letter language ID, i.e. ISO code.
lang (str): IETF language code, such as 'en'.
DOCS: https://spacy.io/api/language
"""
@ -228,6 +228,7 @@ class Language:
"vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode,
}
self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but
@ -978,7 +979,7 @@ class Language:
def __call__(
self,
text: str,
text: Union[str, Doc],
*,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
@ -987,7 +988,9 @@ class Language:
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (str): The text to be processed.
text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
the doc will be passed directly to the pipeline, skipping
`Language.make_doc`.
disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, dict]): An optional dictionary with extra
keyword arguments for specific components.
@ -995,7 +998,7 @@ class Language:
DOCS: https://spacy.io/api/language#call
"""
doc = self.make_doc(text)
doc = self._ensure_doc(text)
if component_cfg is None:
component_cfg = {}
for name, proc in self.pipeline:
@ -1080,6 +1083,20 @@ class Language:
)
return self.tokenizer(text)
def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
"""Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
if isinstance(doc_like, Doc):
return doc_like
if isinstance(doc_like, str):
return self.make_doc(doc_like)
raise ValueError(Errors.E866.format(type=type(doc_like)))
def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
"""Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
doc = self._ensure_doc(doc_like)
doc._context = context
return doc
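With `_ensure_doc` in place, calling `nlp()` with a pre-constructed `Doc` skips `make_doc` and runs only the pipeline components, which is handy for custom tokenization. A minimal sketch:

import spacy

# Sketch: a Doc input bypasses tokenization; components run directly.
nlp = spacy.blank("en")
doc = nlp.make_doc("A pre-tokenized text.")
doc = nlp(doc)  # no second tokenization pass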
def update(
self,
examples: Iterable[Example],
@ -1450,7 +1467,7 @@ class Language:
@overload
def pipe(
self,
texts: Iterable[str],
texts: Iterable[Union[str, Doc]],
*,
as_tuples: Literal[False] = ...,
batch_size: Optional[int] = ...,
@ -1463,7 +1480,7 @@ class Language:
@overload
def pipe( # noqa: F811
self,
texts: Iterable[Tuple[str, _AnyContext]],
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
*,
as_tuples: Literal[True] = ...,
batch_size: Optional[int] = ...,
@ -1475,7 +1492,9 @@ class Language:
def pipe( # noqa: F811
self,
texts: Union[Iterable[str], Iterable[Tuple[str, _AnyContext]]],
texts: Union[
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
],
*,
as_tuples: bool = False,
batch_size: Optional[int] = None,
@ -1485,7 +1504,8 @@ class Language:
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
"""Process texts as a stream, and yield `Doc` objects in order.
texts (Iterable[str]): A sequence of texts to process.
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
process.
as_tuples (bool): If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
@ -1500,23 +1520,24 @@ class Language:
"""
# Handle texts with context as tuples
if as_tuples:
texts = cast(Iterable[Tuple[str, _AnyContext]], texts)
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
docs_with_contexts = (
self._ensure_doc_with_context(text, context) for text, context in texts
)
docs = self.pipe(
texts,
docs_with_contexts,
batch_size=batch_size,
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
)
for doc, context in zip(docs, contexts):
for doc in docs:
context = doc._context
doc._context = None
yield (doc, context)
return
# At this point, we know that we're dealing with an iterable of plain texts
texts = cast(Iterable[str], texts)
texts = cast(Iterable[Union[str, Doc]], texts)
# Set argument defaults
if n_process == -1:
@ -1551,7 +1572,7 @@ class Language:
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else:
# if n_process == 1, no processes are forked.
docs = (self.make_doc(text) for text in texts)
docs = (self._ensure_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
for doc in docs:
@ -1570,7 +1591,7 @@ class Language:
def _multiprocessing_pipe(
self,
texts: Iterable[str],
texts: Iterable[Union[str, Doc]],
pipes: Iterable[Callable[..., Iterator[Doc]]],
n_process: int,
batch_size: int,
@ -1596,7 +1617,7 @@ class Language:
procs = [
mp.Process(
target=_apply_pipes,
args=(self.make_doc, pipes, rch, sch, Underscore.get_state()),
args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
)
for rch, sch in zip(texts_q, bytedocs_send_ch)
]
@ -1609,11 +1630,12 @@ class Language:
recv.recv() for recv in cycle(bytedocs_recv_ch)
)
try:
for i, (_, (byte_doc, byte_error)) in enumerate(
for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1
):
if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc)
doc._context = byte_context
yield doc
elif byte_error is not None:
error = srsly.msgpack_loads(byte_error)
@ -2138,7 +2160,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]:
def _apply_pipes(
make_doc: Callable[[str], Doc],
ensure_doc: Callable[[Union[str, Doc]], Doc],
pipes: Iterable[Callable[..., Iterator[Doc]]],
receiver,
sender,
@ -2146,7 +2168,8 @@ def _apply_pipes(
) -> None:
"""Worker for Language.pipe
make_doc (Callable[[str,] Doc]): Function to create Doc from text.
ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
or raise an error if the input is neither a Doc nor a string.
pipes (Iterable[Pipe]): The components to apply.
receiver (multiprocessing.Connection): Pipe to receive text. Usually
created by `multiprocessing.Pipe()`
@ -2159,16 +2182,16 @@ def _apply_pipes(
while True:
try:
texts = receiver.get()
docs = (make_doc(text) for text in texts)
docs = (ensure_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs) # type: ignore[arg-type, assignment]
# Connection does not accept unpicklable objects, so send list.
byte_docs = [(doc.to_bytes(), None) for doc in docs]
padding = [(None, None)] * (len(texts) - len(byte_docs))
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
padding = [(None, None, None)] * (len(texts) - len(byte_docs))
sender.send(byte_docs + padding) # type: ignore[operator]
except Exception:
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))]
padding = [(None, None)] * (len(texts) - 1)
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
padding = [(None, None, None)] * (len(texts) - 1)
sender.send(error_msg + padding)
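Because the context now rides along on `doc._context` (serialized as the middle element of the `(byte_doc, byte_context, byte_error)` tuples above), `as_tuples=True` behaves the same with and without multiprocessing. A sketch:

import spacy

# Sketch: contexts are carried on doc._context, so this also works
# unchanged with n_process > 1.
nlp = spacy.blank("en")
pairs = [("First text.", {"id": 1}), ("Second text.", {"id": 2})]
for doc, context in nlp.pipe(pairs, as_tuples=True):
    print(context["id"], doc.text)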

View File

@ -284,7 +284,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
@ -294,7 +294,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, unicode x):
def __set__(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_:
@ -304,7 +304,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
@ -314,7 +314,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
@ -324,7 +324,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
@ -332,7 +332,7 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, unicode x):
def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags:

View File

@ -148,9 +148,9 @@ cdef class DependencyMatcher:
Creates a token key to be used by the matcher
"""
return self._normalize_key(
unicode(key) + DELIMITER +
unicode(pattern_idx) + DELIMITER +
unicode(token_idx)
str(key) + DELIMITER +
str(pattern_idx) + DELIMITER +
str(token_idx)
)
def add(self, key, patterns, *, on_match=None):
@ -424,7 +424,7 @@ cdef class DependencyMatcher:
return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _normalize_key(self, key):
if isinstance(key, basestring):
if isinstance(key, str):
return self.vocab.strings.add(key)
else:
return key

View File

@ -312,7 +312,7 @@ cdef class Matcher:
return final_results
def _normalize_key(self, key):
if isinstance(key, basestring):
if isinstance(key, str):
return self.vocab.strings.add(key)
else:
return key
@ -360,7 +360,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
for i, token in enumerate(doclike):
for name, index in extensions.items():
value = token._.get(name)
if isinstance(value, basestring):
if isinstance(value, str):
value = token.vocab.strings[value]
extra_attr_values[i * nr_extra_attr + index] = value
# Main loop
@ -786,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
def _get_attr_values(spec, string_store):
attr_values = []
for attr, value in spec.items():
if isinstance(attr, basestring):
if isinstance(attr, str):
attr = attr.upper()
if attr == '_':
continue
@ -797,7 +797,7 @@ def _get_attr_values(spec, string_store):
if attr == "IS_SENT_START":
attr = "SENT_START"
attr = IDS.get(attr)
if isinstance(value, basestring):
if isinstance(value, str):
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
@ -938,7 +938,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = []
for attr, value in spec.items():
if isinstance(attr, basestring):
if isinstance(attr, str):
if attr == "_":
output.extend(
_get_extension_extra_predicates(
@ -995,7 +995,7 @@ def _get_operators(spec):
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
# Fix casing
spec = {key.upper(): values for key, values in spec.items()
if isinstance(key, basestring)}
if isinstance(key, str)}
if "OP" not in spec:
return (ONE,)
elif spec["OP"] in lookup:
@ -1013,7 +1013,7 @@ def _get_extensions(spec, string_store, name2index):
if isinstance(value, dict):
# Handle predicates (e.g. "IN") in the extra_predicates, not here.
continue
if isinstance(value, basestring):
if isinstance(value, str):
value = string_store.add(value)
if name not in name2index:
name2index[name] = len(name2index)

View File

@ -1,11 +1,13 @@
from typing import List, Tuple, Callable, Optional, cast
from typing import List, Tuple, Callable, Optional, Sequence, cast
from thinc.initializers import glorot_uniform_init
from thinc.util import partial
from thinc.types import Ragged, Floats2d, Floats1d
from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
from thinc.api import Model, Ops, registry
from ..tokens import Doc
from ..errors import Errors
from ..vectors import Mode
from ..vocab import Vocab
@registry.layers("spacy.StaticVectors.v2")
@ -34,20 +36,32 @@ def StaticVectors(
def forward(
model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
) -> Tuple[Ragged, Callable]:
if not sum(len(doc) for doc in docs):
token_count = sum(len(doc) for doc in docs)
if not token_count:
return _handle_empty(model.ops, model.get_dim("nO"))
key_attr = model.attrs["key_attr"]
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data))
rows = model.ops.flatten(
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
key_attr: int = model.attrs["key_attr"]
keys: Ints1d = model.ops.flatten(
cast(Sequence, [doc.to_array(key_attr) for doc in docs])
)
vocab: Vocab = docs[0].vocab
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default:
V = cast(Floats2d, model.ops.asarray(vocab.vectors.data))
rows = vocab.vectors.find(keys=keys)
V = model.ops.as_contig(V[rows])
elif vocab.vectors.mode == Mode.floret:
V = cast(Floats2d, vocab.vectors.get_batch(keys))
V = model.ops.as_contig(V)
else:
raise RuntimeError(Errors.E896)
try:
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
vectors_data = model.ops.gemm(V, W, trans2=True)
except ValueError:
raise RuntimeError(Errors.E896)
# Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
vectors_data[rows < 0] = 0
if vocab.vectors.mode == Mode.default:
# Convert negative indices to 0-vectors
# TODO: more options for UNK tokens
vectors_data[rows < 0] = 0
output = Ragged(
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
)
@ -63,7 +77,7 @@ def forward(
model.inc_grad(
"W",
model.ops.gemm(
cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True
cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True
),
)
return []
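The layer now branches on the vectors mode: the default table looks up rows with `Vectors.find` and zeroes unknown keys, while floret mode derives a vector for every key from subwords via `get_batch`, so there are no OOV rows to zero. A sketch of the same mode check, assuming spaCy v3.2+:

import spacy
from spacy.vectors import Mode

# Sketch: the check StaticVectors.forward performs on the vocab.
nlp = spacy.blank("en")
if nlp.vocab.vectors.mode == Mode.default:
    print("row-lookup vectors; negative rows become 0-vectors")
elif nlp.vocab.vectors.mode == Mode.floret:
    print("floret subword vectors via get_batch; no UNK rows")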

View File

@ -17,7 +17,7 @@ from ...errors import Errors
from thinc.extra.search cimport Beam
cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
cdef attr_t SUBTOK_LABEL = hash_string('subtok')
DEF NON_MONOTONIC = True

View File

@ -5,15 +5,15 @@ from pathlib import Path
from .pipe import Pipe
from ..errors import Errors
from ..training import validate_examples, Example
from ..training import Example
from ..language import Language
from ..matcher import Matcher
from ..scorer import Scorer
from ..symbols import IDS, TAG, POS, MORPH, LEMMA
from ..symbols import IDS
from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab
from ..util import SimpleFrozenList
from ..util import SimpleFrozenList, registry
from .. import util
@ -23,9 +23,41 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
@Language.factory("attribute_ruler", default_config={"validate": False})
def make_attribute_ruler(nlp: Language, name: str, validate: bool):
return AttributeRuler(nlp.vocab, name, validate=validate)
@Language.factory(
"attribute_ruler",
default_config={
"validate": False,
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
},
)
def make_attribute_ruler(
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(
Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
@registry.scorers("spacy.attribute_ruler_scorer.v1")
def make_attribute_ruler_scorer():
return attribute_ruler_score
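Moving the scoring logic out of `AttributeRuler.score` into a registered function means a replacement scorer can be swapped in from the config. A sketch using a hypothetical registry name:

from spacy.scorer import Scorer
from spacy.util import registry

# Sketch: "my_tag_only_scorer.v1" is a hypothetical name; register any
# callable and reference it from [components.attribute_ruler.scorer].
@registry.scorers("my_tag_only_scorer.v1")
def make_tag_only_scorer():
    def tag_only_score(examples, **kwargs):
        return Scorer.score_token_attr(examples, "tag", **kwargs)
    return tag_only_score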
class AttributeRuler(Pipe):
@ -36,7 +68,12 @@ class AttributeRuler(Pipe):
"""
def __init__(
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
self,
vocab: Vocab,
name: str = "attribute_ruler",
*,
validate: bool = False,
scorer: Optional[Callable] = attribute_ruler_score,
) -> None:
"""Create the AttributeRuler. After creation, you can add patterns
with the `.initialize()` or `.add_patterns()` methods, or load patterns
@ -45,6 +82,10 @@ class AttributeRuler(Pipe):
vocab (Vocab): The vocab.
name (str): The pipe name. Defaults to "attribute_ruler".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
"lemma" and Scorer.score_token_attr_per_feat for the attribute
"morph".
RETURNS (AttributeRuler): The AttributeRuler component.
@ -57,6 +98,7 @@ class AttributeRuler(Pipe):
self.attrs: List[Dict] = []
self._attrs_unnormed: List[Dict] = [] # store for reference
self.indices: List[int] = []
self.scorer = scorer
def clear(self) -> None:
"""Reset all patterns."""
@ -228,45 +270,6 @@ class AttributeRuler(Pipe):
all_patterns.append(p)
return all_patterns # type: ignore[return-value]
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
and "lemma" for the target token attributes.
DOCS: https://spacy.io/api/tagger#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "AttributeRuler.score")
results = {}
attrs = set() # type: ignore
for token_attrs in self.attrs:
attrs.update(token_attrs)
for attr in attrs:
if attr == TAG:
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
elif attr == POS:
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
elif attr == MORPH:
results.update(
Scorer.score_token_attr(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
elif attr == LEMMA:
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the AttributeRuler to a bytestring.

View File

@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from collections import defaultdict
from typing import Optional, Iterable
from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
@ -12,7 +12,7 @@ from ..language import Language
from ._parser_internals import nonproj
from ._parser_internals.nonproj import DELIMITER
from ..scorer import Scorer
from ..training import validate_examples
from ..util import registry
default_model_config = """
@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
@ -63,7 +64,8 @@ def make_parser(
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int
min_action_freq: int,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
@ -100,6 +102,7 @@ def make_parser(
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
scorer (Optional[Callable]): The scoring method.
"""
return DependencyParser(
nlp.vocab,
@ -115,7 +118,8 @@ def make_parser(
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
incorrect_spans_key=None,
scorer=scorer,
)
@Language.factory(
@ -130,6 +134,7 @@ def make_parser(
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
@ -151,6 +156,7 @@ def make_beam_parser(
beam_width: int,
beam_density: float,
beam_update_prob: float,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled
@ -207,10 +213,41 @@ def make_beam_parser(
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
incorrect_spans_key=None,
scorer=scorer,
)
def parser_score(examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
@registry.scorers("spacy.parser_scorer.v1")
def make_parser_scorer():
return parser_score
cdef class DependencyParser(Parser):
"""Pipeline component for dependency parsing.
@ -233,6 +270,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
scorer=parser_score,
):
"""Create a DependencyParser.
"""
@ -249,6 +287,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
@property
@ -281,31 +320,6 @@ cdef class DependencyParser(Parser):
labels.add(label)
return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
def scored_parses(self, beams):
"""Return two dictionaries with scores for each beam/doc that was processed:
one containing (i, head) keys, and another containing (i, label) keys.

View File

@ -17,10 +17,12 @@ from ..language import Language
from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList
from ..util import SimpleFrozenList, registry
from .. import util
from ..scorer import Scorer
# See #9050
BACKWARD_OVERWRITE = True
default_model_config = """
[model]
@ -51,6 +53,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
},
default_score_weights={
"nel_micro_f": 1.0,
@ -69,6 +73,8 @@ def make_entity_linker(
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool,
scorer: Optional[Callable],
):
"""Construct an EntityLinker component.
@ -82,6 +88,7 @@ def make_entity_linker(
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method.
"""
return EntityLinker(
nlp.vocab,
@ -93,9 +100,20 @@ def make_entity_linker(
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
)
def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
@registry.scorers("spacy.entity_linker_scorer.v1")
def make_entity_linker_scorer():
return entity_linker_score
class EntityLinker(TrainablePipe):
"""Pipeline component for named entity linking.
@ -116,6 +134,8 @@ class EntityLinker(TrainablePipe):
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
) -> None:
"""Initialize an entity linker.
@ -130,6 +150,8 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
@ -141,11 +163,12 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
self.cfg: Dict[str, Any] = {}
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
@ -384,23 +407,14 @@ class EntityLinker(TrainablePipe):
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
for doc in docs:
for ent in doc.ents:
kb_id = kb_ids[i]
i += 1
for token in ent:
token.ent_kb_id_ = kb_id
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS TODO: https://spacy.io/api/entity_linker#score
"""
validate_examples(examples, "EntityLinker.score")
return Scorer.score_links(examples, negative_labels=[self.NIL])
if token.ent_kb_id == 0 or overwrite:
token.ent_kb_id_ = kb_id
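`set_annotations` now honours the new `overwrite` flag: with `overwrite=False`, tokens that already carry a KB id keep it. A config sketch:

import spacy

# Sketch: overwrite defaults to True (BACKWARD_OVERWRITE) for backwards
# compatibility; set False to preserve pre-set token.ent_kb_id values.
nlp = spacy.blank("en")
nlp.add_pipe("entity_linker", config={"overwrite": False})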
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.

View File

@ -9,11 +9,10 @@ from .pipe import Pipe
from ..training import Example
from ..language import Language
from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..scorer import get_ner_prf
from ..training import validate_examples
DEFAULT_ENT_ID_SEP = "||"
@ -28,6 +27,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
@ -43,6 +43,7 @@ def make_entity_ruler(
validate: bool,
overwrite_ents: bool,
ent_id_sep: str,
scorer: Optional[Callable],
):
return EntityRuler(
nlp,
@ -51,9 +52,19 @@ def make_entity_ruler(
validate=validate,
overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep,
scorer=scorer,
)
def entity_ruler_score(examples, **kwargs):
return get_ner_prf(examples)
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
return entity_ruler_score
class EntityRuler(Pipe):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical
@ -75,6 +86,7 @@ class EntityRuler(Pipe):
overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
patterns: Optional[List[PatternType]] = None,
scorer: Optional[Callable] = entity_ruler_score,
) -> None:
"""Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
@ -95,6 +107,8 @@ class EntityRuler(Pipe):
overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs.
scorer (Optional[Callable]): The scoring method. Defaults to
spacy.scorer.get_ner_prf.
DOCS: https://spacy.io/api/entityruler#init
"""
@ -113,6 +127,7 @@ class EntityRuler(Pipe):
self._ent_ids = defaultdict(tuple) # type: ignore
if patterns is not None:
self.add_patterns(patterns)
self.scorer = scorer
def __len__(self) -> int:
"""The number of all patterns added to the entity ruler."""
@ -363,10 +378,6 @@ class EntityRuler(Pipe):
label = f"{label}{self.ent_id_sep}{ent_id}"
return label
def score(self, examples, **kwargs):
validate_examples(examples, "EntityRuler.score")
return get_ner_prf(examples)
def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":

View File

@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
from ..training import validate_examples
from ..util import logger, SimpleFrozenList
from ..util import logger, SimpleFrozenList, registry
from .. import util
@Language.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "lookup", "overwrite": False},
default_config={
"model": None,
"mode": "lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_token_attr(examples, "lemma", **kwargs)
@registry.scorers("spacy.lemmatizer_scorer.v1")
def make_lemmatizer_scorer():
return lemmatizer_score
class Lemmatizer(Pipe):
@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
*,
mode: str = "lookup",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
"""Initialize a Lemmatizer.
@ -69,6 +90,8 @@ class Lemmatizer(Pipe):
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "lemma".
DOCS: https://spacy.io/api/lemmatizer#init
"""
@ -89,6 +112,7 @@ class Lemmatizer(Pipe):
raise ValueError(Errors.E1003.format(mode=mode))
self.lemmatize = getattr(self, mode_attr)
self.cache = {} # type: ignore[var-annotated]
self.scorer = scorer
@property
def mode(self):
@ -247,17 +271,6 @@ class Lemmatizer(Pipe):
"""
return False
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS: https://spacy.io/api/lemmatizer#score
"""
validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
):

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, Union, Dict
from typing import Optional, Union, Dict, Callable
import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice
@ -17,7 +17,11 @@ from .tagger import Tagger
from .. import util
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
# See #9050
BACKWARD_OVERWRITE = True
BACKWARD_EXTEND = False
default_model_config = """
[model]
@ -48,15 +52,35 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL},
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
overwrite: bool,
extend: bool,
scorer: Optional[Callable],
):
return Morphologizer(nlp.vocab, model, name)
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
def morphologizer_score(examples, **kwargs):
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results
@registry.scorers("spacy.morphologizer_scorer.v1")
def make_morphologizer_scorer():
return morphologizer_score
class Morphologizer(Tagger):
@ -67,6 +91,10 @@ class Morphologizer(Tagger):
vocab: Vocab,
model: Model,
name: str = "morphologizer",
*,
overwrite: bool = BACKWARD_OVERWRITE,
extend: bool = BACKWARD_EXTEND,
scorer: Optional[Callable] = morphologizer_score,
):
"""Initialize a morphologizer.
@ -74,6 +102,9 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#init
"""
@ -85,8 +116,14 @@ class Morphologizer(Tagger):
# store mappings from morph+POS labels to token-level annotations:
# 1) labels_morph stores a mapping from morph+POS->morph
# 2) labels_pos stores a mapping from morph+POS->POS
cfg = {"labels_morph": {}, "labels_pos": {}}
cfg = {
"labels_morph": {},
"labels_pos": {},
"overwrite": overwrite,
"extend": extend,
}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property
def labels(self):
@ -192,14 +229,34 @@ class Morphologizer(Tagger):
docs = [docs]
cdef Doc doc
cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
cdef bint extend = self.cfg["extend"]
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
morph = self.labels[tag_id]
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
# set morph
if doc.c[j].morph == 0 or overwrite or extend:
if overwrite and extend:
# morphologizer morph overwrites any existing features
# while extending
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
elif extend:
# existing features are preserved and any new features
# are added
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
else:
# clobber
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
# set POS
if doc.c[j].pos == 0 or overwrite:
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
@ -246,24 +303,3 @@ class Morphologizer(Tagger):
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "Morphologizer.score")
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results
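
The overwrite/extend interplay above reduces to three dict-merge strategies. A minimal sketch with plain dicts standing in for the Morphology.feats_to_dict output (feature names and values are invented, and the unset-morph guard is left out):

def merge_morph(existing, predicted, overwrite, extend):
    # mirrors the three branches of set_annotations above
    if overwrite and extend:
        # predicted features win, but unpredicted existing features survive
        merged = dict(existing)
        merged.update(predicted)
    elif extend:
        # existing features win, predictions only fill the gaps
        merged = dict(predicted)
        merged.update(existing)
    else:
        # plain overwrite: the prediction replaces everything
        merged = dict(predicted)
    return merged

existing = {"Feat": "A", "That": "A", "This": "A"}
predicted = {"Feat": "N"}
assert merge_morph(existing, predicted, True, True) == {"Feat": "N", "That": "A", "This": "A"}
assert merge_morph(existing, predicted, False, True) == {"Feat": "A", "That": "A", "This": "A"}
assert merge_morph(existing, predicted, True, False) == {"Feat": "N"}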

View File

@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from collections import defaultdict
from typing import Optional, Iterable
from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language
from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
from ..util import registry
default_model_config = """
@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@ -52,7 +53,8 @@ def make_ner(
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str]=None
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
@ -80,6 +82,7 @@ def make_ner(
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
@ -92,6 +95,7 @@ def make_ner(
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
scorer=scorer,
)
@Language.factory(
@ -104,7 +108,8 @@ def make_ner(
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32,
"incorrect_spans_key": None
"incorrect_spans_key": None,
"scorer": None,
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
@ -117,7 +122,8 @@ def make_beam_ner(
beam_width: int,
beam_density: float,
beam_update_prob: float,
incorrect_spans_key: Optional[str]=None
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.
@ -153,6 +159,7 @@ def make_beam_ner(
and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities.
scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
@ -164,10 +171,20 @@ def make_beam_ner(
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
def ner_score(examples, **kwargs):
return get_ner_prf(examples, **kwargs)
@registry.scorers("spacy.ner_scorer.v1")
def make_ner_scorer():
return ner_score
cdef class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition.
@ -188,6 +205,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
scorer=ner_score,
):
"""Create an EntityRecognizer.
"""
@ -204,6 +222,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
def add_multitask_objective(self, mt_component):
@ -227,17 +246,6 @@ cdef class EntityRecognizer(Parser):
if move[0] in ("B", "I", "L", "U"))
return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
DOCS: https://spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
return get_ner_prf(examples)
def scored_ents(self, beams):
"""Return a dictionary of (start, end, label) tuples with corresponding scores
for each beam/doc that was processed.
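
With the scorer exposed in the factory config, swapping in a custom scorer is a registry entry away. A hedged sketch: the name my_ner_scorer.v1 and the trimmed return value are invented for illustration.

from spacy.scorer import get_ner_prf
from spacy.util import registry

@registry.scorers("my_ner_scorer.v1")
def make_my_ner_scorer():
    def my_ner_score(examples, **kwargs):
        scores = get_ner_prf(examples, **kwargs)
        # report only the micro f-score, dropping the per-type breakdown
        return {"ents_f": scores["ents_f"]}
    return my_ner_score

# usage: nlp.add_pipe("ner", config={"scorer": {"@scorers": "my_ner_scorer.v1"}})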

View File

@ -81,6 +81,17 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#score
"""
if hasattr(self, "scorer") and self.scorer is not None:
scorer_kwargs = {}
# use default settings from cfg (e.g., threshold)
if hasattr(self, "cfg") and isinstance(self.cfg, dict):
scorer_kwargs.update(self.cfg)
# override self.cfg["labels"] with self.labels
if hasattr(self, "labels"):
scorer_kwargs["labels"] = self.labels
# override with kwargs settings
scorer_kwargs.update(kwargs)
return self.scorer(examples, **scorer_kwargs)
return {}
@property
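
The kwargs assembly in Pipe.score gives a fixed precedence: cfg defaults first, then the component's own labels, then the caller's overrides. A plain-Python sketch of the same merge (build_scorer_kwargs and its arguments are stand-ins, not spaCy API):

def build_scorer_kwargs(cfg, labels, caller_kwargs):
    scorer_kwargs = {}
    scorer_kwargs.update(cfg)            # defaults, e.g. {"threshold": 0.5}
    scorer_kwargs["labels"] = labels     # always the component's own labels
    scorer_kwargs.update(caller_kwargs)  # per-call settings win last
    return scorer_kwargs

assert build_scorer_kwargs(
    {"threshold": 0.5}, ("POS", "NEG"), {"threshold": 0.8}
) == {"threshold": 0.8, "labels": ("POS", "NEG")}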

View File

@ -1,26 +1,32 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, List
from typing import Optional, List, Callable
import srsly
from ..tokens.doc cimport Doc
from .pipe import Pipe
from .senter import senter_score
from ..language import Language
from ..scorer import Scorer
from ..training import validate_examples
from .. import util
# see #9050
BACKWARD_OVERWRITE = False
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None},
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(
nlp: Language,
name: str,
punct_chars: Optional[List[str]]
punct_chars: Optional[List[str]],
overwrite: bool,
scorer: Optional[Callable],
):
return Sentencizer(name, punct_chars=punct_chars)
return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
class Sentencizer(Pipe):
@ -41,12 +47,20 @@ class Sentencizer(Pipe):
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'', '']
def __init__(self, name="sentencizer", *, punct_chars=None):
def __init__(
self,
name="sentencizer",
*,
punct_chars=None,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object.
RETURNS (Sentencizer): The sentencizer component.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencizer#init
"""
@ -55,6 +69,8 @@ class Sentencizer(Pipe):
self.punct_chars = set(punct_chars)
else:
self.punct_chars = set(self.default_punct_chars)
self.overwrite = overwrite
self.scorer = scorer
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
@ -115,29 +131,12 @@ class Sentencizer(Pipe):
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber existing sentence boundaries
if doc.c[j].sent_start == 0:
if doc.c[j].sent_start == 0 or self.overwrite:
if tag_id:
doc.c[j].sent_start = 1
else:
doc.c[j].sent_start = -1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "Sentencizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
def to_bytes(self, *, exclude=tuple()):
"""Serialize the sentencizer to a bytestring.
@ -145,7 +144,7 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer#to_bytes
"""
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the sentencizer from a bytestring.
@ -157,6 +156,7 @@ class Sentencizer(Pipe):
"""
cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self
def to_disk(self, path, *, exclude=tuple()):
@ -166,7 +166,7 @@ class Sentencizer(Pipe):
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_disk(self, path, *, exclude=tuple()):
@ -178,4 +178,5 @@ class Sentencizer(Pipe):
path = path.with_suffix(".json")
cfg = srsly.read_json(path)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self
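
A quick sketch of the new flag surviving a bytes roundtrip, assuming blank English pipelines:

import spacy

nlp = spacy.blank("en")
sentencizer = nlp.add_pipe("sentencizer", config={"overwrite": True})

nlp2 = spacy.blank("en")
sentencizer2 = nlp2.add_pipe("sentencizer")  # default: overwrite=False
sentencizer2.from_bytes(sentencizer.to_bytes())
# overwrite is now serialized alongside punct_chars
assert sentencizer2.overwrite is True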

View File

@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from itertools import islice
from typing import Optional, Callable
import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@ -11,8 +12,11 @@ from ..language import Language
from ..errors import Errors
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@ -34,11 +38,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"senter",
assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL},
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model):
return SentenceRecognizer(nlp.vocab, model, name)
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def senter_score(examples, **kwargs):
def has_sents(doc):
return doc.has_annotation("SENT_START")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer():
return senter_score
class SentenceRecognizer(Tagger):
@ -46,13 +64,23 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer
"""
def __init__(self, vocab, model, name="senter"):
def __init__(
self,
vocab,
model,
name="senter",
*,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize a sentence recognizer.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencerecognizer#init
"""
@ -60,7 +88,8 @@ class SentenceRecognizer(Tagger):
self.model = model
self.name = name
self._rehearsal_model = None
self.cfg = {}
self.cfg = {"overwrite": overwrite}
self.scorer = scorer
@property
def labels(self):
@ -85,13 +114,13 @@ class SentenceRecognizer(Tagger):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber existing sentence boundaries
if doc.c[j].sent_start == 0:
if doc.c[j].sent_start == 0 or overwrite:
if tag_id == 1:
doc.c[j].sent_start = 1
else:
@ -153,18 +182,3 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None):
raise NotImplementedError
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencerecognizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
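
Since the default config now references the registered scorer, the resolved component ends up holding the plain function. A small check, assuming a blank pipeline (the senter still needs training before it can predict anything):

import spacy
from spacy.pipeline.senter import senter_score

nlp = spacy.blank("en")
senter = nlp.add_pipe("senter")
# {"@scorers": "spacy.senter_scorer.v1"} resolves back to senter_score
assert senter.scorer is senter_score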

View File

@ -104,6 +104,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
@ -113,8 +114,9 @@ def make_spancat(
suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
threshold: float = 0.5,
max_positive: Optional[int] = None,
scorer: Optional[Callable],
threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
@ -144,9 +146,28 @@ def make_spancat(
threshold=threshold,
max_positive=max_positive,
name=name,
scorer=scorer,
)
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs)
attr_prefix = "spans_"
key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
@registry.scorers("spacy.spancat_scorer.v1")
def make_spancat_scorer():
return spancat_score
class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text.
@ -163,8 +184,25 @@ class SpanCategorizer(TrainablePipe):
spans_key: str = "spans",
threshold: float = 0.5,
max_positive: Optional[int] = None,
scorer: Optional[Callable] = spancat_score,
) -> None:
"""Initialize the span categorizer.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
spans_key (str): Key of the Doc.spans dict to save the spans under.
During initialization and training, the component will look for
spans on the reference document under the same key. Defaults to
`"spans"`.
threshold (float): Minimum probability to consider a prediction
positive. Spans with a positive prediction will be saved on the Doc.
Defaults to 0.5.
max_positive (Optional[int]): Maximum number of labels to consider
positive per span. Defaults to None, indicating no limit.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
DOCS: https://spacy.io/api/spancategorizer#init
"""
@ -178,6 +216,7 @@ class SpanCategorizer(TrainablePipe):
self.suggester = suggester
self.model = model
self.name = name
self.scorer = scorer
@property
def key(self) -> str:
@ -379,26 +418,6 @@ class SpanCategorizer(TrainablePipe):
else:
self.model.initialize()
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/spancategorizer#score
"""
validate_examples(examples, "SpanCategorizer.score")
self._validate_categories(examples)
kwargs = dict(kwargs)
attr_prefix = "spans_"
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
def _validate_categories(self, examples: Iterable[Example]):
# TODO
pass
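
The spans_key plumbing above drives both the score names and the span lookup. A minimal sketch of the derivation, using "sc" (the spancat default) as the key:

attr_prefix = "spans_"
spans_key = "sc"

attr = f"{attr_prefix}{spans_key}"  # "spans_sc" -> score names spans_sc_p/r/f

def getter(doc, attr):
    # strip the prefix again to read the spans from doc.spans["sc"]
    return doc.spans.get(attr[len(attr_prefix):], [])

assert attr[len(attr_prefix):] == "sc"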

View File

@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Optional
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@ -18,8 +19,11 @@ from ..parts_of_speech import X
from ..errors import Errors, Warnings
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@ -41,10 +45,16 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL},
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):
def make_tagger(
nlp: Language,
name: str,
model: Model,
overwrite: bool,
scorer: Optional[Callable],
):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
@ -52,7 +62,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
return Tagger(nlp.vocab, model, name)
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)
@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
return tagger_score
class Tagger(TrainablePipe):
@ -60,13 +79,23 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger
"""
def __init__(self, vocab, model, name="tagger"):
def __init__(
self,
vocab,
model,
name="tagger",
*,
overwrite=BACKWARD_OVERWRITE,
scorer=tagger_score,
):
"""Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
DOCS: https://spacy.io/api/tagger#init
"""
@ -74,8 +103,9 @@ class Tagger(TrainablePipe):
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"labels": []}
cfg = {"labels": [], "overwrite": overwrite}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property
def labels(self):
@ -135,13 +165,13 @@ class Tagger(TrainablePipe):
docs = [docs]
cdef Doc doc
cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
if doc.c[j].tag == 0:
if doc.c[j].tag == 0 or overwrite:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
def update(self, examples, *, drop=0., sgd=None, losses=None):
@ -289,15 +319,3 @@ class Tagger(TrainablePipe):
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attribute "tag".
DOCS: https://spacy.io/api/tagger#score
"""
validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)
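
The new per-token rule is tiny but worth spelling out. A sketch with 0 standing for "no tag set", as in the token struct above:

def apply_tag(current, predicted, overwrite):
    # mirrors: if doc.c[j].tag == 0 or overwrite: doc.c[j].tag = predicted
    return predicted if current == 0 or overwrite else current

assert apply_tag(0, 42, overwrite=False) == 42  # unset tags are always filled
assert apply_tag(7, 42, overwrite=False) == 7   # preset tags kept by default
assert apply_tag(7, 42, overwrite=True) == 42   # overwrite clobbers presets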

View File

@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab
@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
default_config={
"threshold": 0.5,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
},
)
def make_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@ -95,8 +104,23 @@ def make_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
"""
return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
@registry.scorers("spacy.textcat_scorer.v1")
def make_textcat_scorer():
return textcat_score
class TextCategorizer(TrainablePipe):
@ -106,7 +130,13 @@ class TextCategorizer(TrainablePipe):
"""
def __init__(
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
self,
vocab: Vocab,
model: Model,
name: str = "textcat",
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
) -> None:
"""Initialize a text categorizer for single-label classification.
@ -115,6 +145,8 @@ class TextCategorizer(TrainablePipe):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats".
DOCS: https://spacy.io/api/textcategorizer#init
"""
@ -124,6 +156,7 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg)
self.scorer = scorer
@property
def labels(self) -> Tuple[str]:
@ -353,26 +386,6 @@ class TextCategorizer(TrainablePipe):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "TextCategorizer.score")
self._validate_categories(examples)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault("positive_label", self.cfg["positive_label"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=False,
**kwargs,
)
def _validate_categories(self, examples: Iterable[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:
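
The same registry pattern works for textcat. A hedged sketch (the name my_textcat_scorer.v1 is invented); note that the generic Pipe.score already injects labels, threshold and positive_label from the component's cfg, so the scorer only pins attr and multi_label:

from spacy.scorer import Scorer
from spacy.util import registry

@registry.scorers("my_textcat_scorer.v1")
def make_my_textcat_scorer():
    def my_score(examples, **kwargs):
        # labels/threshold/positive_label arrive via **kwargs from Pipe.score
        return Scorer.score_cats(examples, "cats", multi_label=False, **kwargs)
    return my_score

# usage: nlp.add_pipe("textcat", config={"scorer": {"@scorers": "my_textcat_scorer.v1"}})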

View File

@ -5,10 +5,11 @@ from thinc.api import Model, Config
from thinc.types import Floats2d
from ..language import Language
from ..training import Example, validate_examples, validate_get_examples
from ..training import Example, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab
from .textcat import TextCategorizer
@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat_multilabel",
assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
},
)
def make_multilabel_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@ -97,7 +106,23 @@ def make_multilabel_textcat(
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
"""
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=True,
**kwargs,
)
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score
class MultiLabel_TextCategorizer(TextCategorizer):
@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name: str = "textcat_multilabel",
*,
threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
) -> None:
"""Initialize a text categorizer for multi-label classification.
@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg)
self.scorer = scorer
def initialize( # type: ignore[override]
self,
@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "MultiLabel_TextCategorizer.score")
kwargs.setdefault("threshold", self.cfg["threshold"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=True,
**kwargs,
)
def _validate_categories(self, examples: Iterable[Example]):
"""This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'."""

View File

@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab
cdef public object model
cdef public object cfg
cdef public object scorer

View File

@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None
incorrect_spans_key=None,
scorer=None,
):
"""Create a Parser.
@ -86,6 +87,7 @@ cdef class Parser(TrainablePipe):
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method. Defaults to None.
"""
self.vocab = vocab
self.name = name
@ -117,6 +119,7 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask)
self._rehearsal_model = None
self.scorer = scorer
def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments"""

View File

@ -351,7 +351,8 @@ class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
n_save_every: Optional[StrictInt] = Field(..., title="Save an additional temporary model after every n batches within an epoch")
n_save_epoch: Optional[StrictInt] = Field(..., title="Save the model after every n epochs")
optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")

View File

@ -247,18 +247,21 @@ class Scorer:
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Return PRF scores per feat for a token attribute in UFEATS format.
"""Return micro PRF and PRF scores per feat for a token attribute in
UFEATS format.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
missing_values (Set[Any]): Attribute values to treat as missing annotation
in the reference annotation.
RETURNS (dict): A dictionary containing the per-feat PRF scores under
the key attr_per_feat.
missing_values (Set[Any]): Attribute values to treat as missing
annotation in the reference annotation.
RETURNS (dict): A dictionary containing the micro PRF scores under the
key attr_micro_p/r/f and the per-feat PRF scores under
attr_per_feat.
"""
micro_score = PRFScore()
per_feat = {}
for example in examples:
pred_doc = example.predicted
@ -300,15 +303,22 @@ class Scorer:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
score_key = f"{attr}_per_feat"
if any([len(v) for v in per_feat.values()]):
result = {k: v.to_dict() for k, v in per_feat.items()}
return {score_key: result}
result: Dict[str, Any] = {}
if len(micro_score) > 0:
result[f"{attr}_micro_p"] = micro_score.precision
result[f"{attr}_micro_r"] = micro_score.recall
result[f"{attr}_micro_f"] = micro_score.fscore
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
else:
return {score_key: None}
result[f"{attr}_micro_p"] = None
result[f"{attr}_micro_r"] = None
result[f"{attr}_micro_f"] = None
result[f"{attr}_per_feat"] = None
return result
@staticmethod
def score_spans(
@ -545,7 +555,7 @@ class Scorer:
@staticmethod
def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str]
examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER,
@ -721,7 +731,7 @@ class Scorer:
}
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore)
for eg in examples:
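
To see the new return shape, a toy example with one matching and one mismatched feature should do (assuming identical tokenization between the predicted and reference docs; the key-based getter mirrors the morphologizer scorer above):

import spacy
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example

nlp = spacy.blank("en")
words = ["She", "runs"]
pred = Doc(nlp.vocab, words=words, morphs=["Case=Nom", "Number=Sing"])
gold = Doc(nlp.vocab, words=words, morphs=["Case=Nom", "Number=Plur"])
example = Example(pred, gold)

def morph_key_getter(token, attr):
    return getattr(token, attr).key

scores = Scorer.score_token_attr_per_feat([example], "morph", getter=morph_key_getter)
# Case matches (tp=1), Number does not (fp=1, fn=1) -> micro p = r = f = 0.5
print(scores["morph_micro_f"])   # 0.5
print(scores["morph_per_feat"])  # per-feat PRF: Case f=1.0, Number f=0.0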

View File

@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0
cpdef hash_t hash_string(str string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string)
cdef str decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
@ -25,5 +25,5 @@ cdef class StringStore:
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, unicode py_string)
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@ -33,7 +33,7 @@ def get_string_id(key):
return hash_utf8(chars, len(chars))
cpdef hash_t hash_string(unicode string) except 0:
cpdef hash_t hash_string(str string) except 0:
chars = string.encode("utf8")
return hash_utf8(chars, len(chars))
@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef unicode decode_Utf8Str(const Utf8Str* string):
cdef str decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode("utf8")
@ -107,17 +107,17 @@ cdef class StringStore:
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode.
string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
if isinstance(string_or_id, unicode):
if isinstance(string_or_id, str):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
@ -135,14 +135,14 @@ cdef class StringStore:
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
if not isinstance(key, basestring):
if not isinstance(key, str):
return key
else:
return self[key]
def as_string(self, key):
"""If key is a string, return it; otherwise, get the string value."""
if isinstance(key, basestring):
if isinstance(key, str):
return key
else:
return self[key]
@ -153,7 +153,7 @@ cdef class StringStore:
string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode):
if isinstance(string, str):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
@ -189,7 +189,7 @@ cdef class StringStore:
return True
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, unicode):
elif isinstance(string, str):
key = hash_string(string)
else:
string = string.encode("utf8")
@ -269,7 +269,7 @@ cdef class StringStore:
for string in strings:
self.add(string)
cdef const Utf8Str* intern_unicode(self, unicode py_string):
cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string))
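
These hunks are a Python 2 cleanup (unicode and basestring become str); the public behavior is unchanged, e.g.:

from spacy.strings import StringStore

store = StringStore(["apple"])
key = store.add("banana")       # str in, uint64 hash out
assert store[key] == "banana"   # hash back to str
assert store["banana"] == key   # str to hash, without adding
assert "banana" in store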

View File

@ -5,9 +5,11 @@ from spacy.compat import pickle
def test_pickle_single_doc():
nlp = Language()
doc = nlp("pickle roundtrip")
doc._context = 3
data = pickle.dumps(doc, 1)
doc2 = pickle.loads(data)
assert doc2.text == "pickle roundtrip"
assert doc2._context == 3
def test_list_of_docs_pickles_efficiently():

View File

@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda."
tokens = ca_tokenizer(text)
assert len(tokens) == 15
assert tokens[7].text == "aprox."
text = "La Dra. Puig viu a la pl. dels Til·lers."
doc = ca_tokenizer(text)
assert [t.text for t in doc] == [
"La",
"Dra.",
"Puig",
"viu",
"a",
"la",
"pl.",
"d",
"els",
"Til·lers",
".",
]

View File

@ -2,7 +2,14 @@ import pytest
@pytest.mark.parametrize(
"text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])]
"text,expected_tokens",
[
("d'un", ["d'", "un"]),
("s'ha", ["s'", "ha"]),
("del", ["d", "el"]),
("cantar-te", ["cantar", "-te"]),
("-hola", ["-", "hola"]),
],
)
def test_contractions(ca_tokenizer, text, expected_tokens):
"""Test that the contractions are split into two tokens"""

View File

@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
tokens = ca_tokenizer(text)
assert len(tokens) == 140
assert len(tokens) == 146
@pytest.mark.parametrize(
"text,length",
[
("Perquè va anar-hi?", 4),
("Perquè va anar-hi?", 5),
("El cotxe dels veins.", 6),
("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3),
("Vull parlar-te'n demà al matí", 8),
("Vull explicar-t'ho demà al matí", 8),
],
)
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):

View File

@ -8,3 +8,17 @@ import pytest
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_
assert test_lemma == lemma
@pytest.mark.parametrize(
"word,norm",
[
("SUMMER", "サマー"),
("食べ物", "食べ物"),
("綜合", "総合"),
("コンピュータ", "コンピューター"),
],
)
def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
test_norm = ja_tokenizer(word)[0].norm_
assert test_norm == norm

View File

@ -0,0 +1,9 @@
import pytest
from spacy.lang.ja import Japanese
def test_ja_morphologizer_factory():
pytest.importorskip("sudachipy")
nlp = Japanese()
morphologizer = nlp.add_pipe("morphologizer")
assert morphologizer.cfg["extend"] is True

View File

@ -1,3 +1,5 @@
import pickle
from spacy.lang.ja import Japanese
from ...util import make_tempdir
@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.split_mode == "B"
def test_ja_tokenizer_pickle(ja_tokenizer):
b = pickle.dumps(ja_tokenizer)
ja_tokenizer_re = pickle.loads(b)
assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()

View File

@ -34,22 +34,22 @@ SENTENCE_TESTS = [
]
tokens1 = [
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
]
tokens2 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None),
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
]
tokens3 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None),
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
]
SUB_TOKEN_TESTS = [
("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]])
("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
]
# fmt: on
@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
assert len(nlp_c(text)) == len_c
@pytest.mark.parametrize(
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
)
@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
def test_ja_tokenizer_sub_tokens(
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
):
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
assert ja_tokenizer(text).user_data.get("sub_tokens") is None
assert nlp_a(text).user_data.get("sub_tokens") is None
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
@ -132,16 +130,24 @@ def test_ja_tokenizer_sub_tokens(
[
(
"取ってつけた",
("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
("トッ", "", "ツケ", ""),
(["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
(["トッ"], [""], ["ツケ"], [""]),
),
(
"2=3",
([], [], []),
([""], ["_"], ["サン"])
),
],
)
def test_ja_tokenizer_inflections_reading_forms(
ja_tokenizer, text, inflections, reading_forms
):
assert ja_tokenizer(text).user_data["inflections"] == inflections
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
tokens = ja_tokenizer(text)
test_inflections = [tt.morph.get("Inflection") for tt in tokens]
assert test_inflections == list(inflections)
test_readings = [tt.morph.get("Reading") for tt in tokens]
assert test_readings == list(reading_forms)
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):

View File

@ -0,0 +1,24 @@
import pickle
from spacy.lang.ko import Korean
from ...util import make_tempdir
def test_ko_tokenizer_serialize(ko_tokenizer):
tokenizer_bytes = ko_tokenizer.to_bytes()
nlp = Korean()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
ko_tokenizer.to_disk(file_path)
nlp = Korean()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_ko_tokenizer_pickle(ko_tokenizer):
b = pickle.dumps(ko_tokenizer)
ko_tokenizer_re = pickle.loads(b)
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import pytest

View File

@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
("www.google.com", True),
("google.com", True),
("sydney.com", True),
("2girls1cup.org", True),
("1abc2def.org", True),
("http://stupid", True),
("www.hi", True),
("example.com/example", True),
("dog", False),
("1.2", False),
("1.a", False),

View File

@ -0,0 +1,24 @@
import pickle
from spacy.lang.th import Thai
from ...util import make_tempdir
def test_th_tokenizer_serialize(th_tokenizer):
tokenizer_bytes = th_tokenizer.to_bytes()
nlp = Thai()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
th_tokenizer.to_disk(file_path)
nlp = Thai()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_th_tokenizer_pickle(th_tokenizer):
b = pickle.dumps(th_tokenizer)
th_tokenizer_re = pickle.loads(b)
assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()

View File

@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length):
("10.000", True),
("1000", True),
("999,0", True),
("", True),
("", True),
("ክልተ", True),
("ትሪልዮን", True),
("ከልቢ", False),

View File

@ -1,3 +1,5 @@
import pickle
from spacy.lang.vi import Vietnamese
from ...util import make_tempdir
@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi is False
def test_vi_tokenizer_pickle(vi_tokenizer):
b = pickle.dumps(vi_tokenizer)
vi_tokenizer_re = pickle.loads(b)
assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()

View File

@ -32,24 +32,6 @@ def pattern_dicts():
]
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
@pytest.fixture
def tag_map():
return {
@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")
# initialize with patterns from misc registry
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"}
}
@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set
assert scores["morph_acc"] is None
nlp.remove_pipe("attribute_ruler")
# test with custom scorer
@registry.misc("weird_scorer.v1")
def make_weird_scorer():
def weird_scorer(examples, weird_score, **kwargs):
return {"weird_score": weird_score}
return weird_scorer
ruler = nlp.add_pipe(
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
)
ruler.initialize(lambda: [], patterns=pattern_dicts)
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
assert scores["weird_score"] == 0.12345
assert "token_acc" in scores
assert "lemma_acc" not in scores
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
assert scores["weird_score"] == 0.23456
def test_attributeruler_rule_order(nlp):

View File

@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.tests.util import make_tempdir
from spacy.morphology import Morphology
from spacy.attrs import MORPH
from spacy.tokens import Doc
def test_label_types():
@ -137,6 +138,41 @@ def test_overfitting_IO():
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags
# Test overwrite+extend settings
# (note that "" is unset, "_" is set and empty)
morphs = ["Feat=V", "Feat=N", "_"]
doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs)
orig_morphs = [str(t.morph) for t in doc]
orig_pos_tags = [t.pos_ for t in doc]
morphologizer = nlp.get_pipe("morphologizer")
# don't overwrite or extend
morphologizer.cfg["overwrite"] = False
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == orig_morphs
assert [t.pos_ for t in doc] == orig_pos_tags
# overwrite and extend
morphologizer.cfg["overwrite"] = True
morphologizer.cfg["extend"] = True
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"]
# extend without overwriting
morphologizer.cfg["overwrite"] = False
morphologizer.cfg["extend"] = True
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"]
# overwrite without extending
morphologizer.cfg["overwrite"] = True
morphologizer.cfg["extend"] = False
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"]
# Test with unset morph and partial POS
nlp.remove_pipe("morphologizer")
nlp.add_pipe("morphologizer")

View File

@ -1,7 +1,9 @@
import pytest
import pickle
from thinc.api import get_current_ops
from spacy.vocab import Vocab
from spacy.strings import StringStore
from spacy.vectors import Vectors
from ..util import make_tempdir
@ -129,7 +131,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_pickle_vocab(strings, lex_attr):
vocab = Vocab(strings=strings)
ops = get_current_ops()
vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
vocab.vectors = vectors
vocab[strings[0]].norm_ = lex_attr
vocab_pickled = pickle.dumps(vocab)
vocab_unpickled = pickle.loads(vocab_pickled)
assert vocab.to_bytes() == vocab_unpickled.to_bytes()
assert vocab_unpickled.vectors.mode == "floret"

View File

@ -1,5 +1,6 @@
import pytest
from click import NoSuchOption
from packaging.specifiers import SpecifierSet
from spacy.training import docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
@ -491,19 +492,27 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_download_compatibility():
model_name = "en_core_web_sm"
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
assert get_minor_version(about.__version__) == get_minor_version(version)
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
if about.__version__ in spec:
model_name = "en_core_web_sm"
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_validate_compatibility_table():
model_pkgs, compat = get_model_pkgs()
spacy_version = get_minor_version(about.__version__)
current_compat = compat.get(spacy_version, {})
assert len(current_compat) > 0
assert "en_core_web_sm" in current_compat
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
if about.__version__ in spec:
model_pkgs, compat = get_model_pkgs()
spacy_version = get_minor_version(about.__version__)
current_compat = compat.get(spacy_version, {})
assert len(current_compat) > 0
assert "en_core_web_sm" in current_compat
@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"])

View File

@ -8,7 +8,7 @@ from spacy.vocab import Vocab
from spacy.training import Example
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.util import registry, ignore_error, raise_error
from spacy.util import registry, ignore_error, raise_error, find_matching_language
import spacy
from thinc.api import CupyOps, NumpyOps, get_current_ops
@ -255,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
"""Test the error handling of nlp.pipe with input as tuples"""
Language.component("my_evil_component", func=evil_component)
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.add_pipe("my_evil_component")
texts = [
("TEXT 111", 111),
("TEXT 222", 222),
("TEXT 333", 333),
("TEXT 342", 342),
("TEXT 666", 666),
]
with pytest.raises(ValueError):
list(nlp.pipe(texts, as_tuples=True))
nlp.set_error_handler(warn_error)
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
# HACK/TODO? the warnings in child processes don't seem to be
# detected by the mock logger
if n_process == 1:
mock_warning.assert_called()
assert mock_warning.call_count == 2
assert len(tuples) + mock_warning.call_count == len(texts)
assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
"""Test the error handling of a component's pipe method"""
@ -512,6 +544,55 @@ def test_spacy_blank():
assert nlp.meta["name"] == "my_custom_model"
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
("zh-Hant", None),
("zxx", None),
],
)
def test_language_matching(lang, target):
"""
Test that we can look up languages by equivalent or nearly-equivalent
language codes.
"""
assert find_matching_language(lang) == target
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
],
)
def test_blank_languages(lang, target):
"""
Test that we can get spacy.blank in various languages, including codes
that are defined to be equivalent or that match by CLDR language matching.
"""
nlp = spacy.blank(lang)
assert nlp.lang == target
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
def test_language_init_invalid_vocab(value):
err_fragment = "invalid value"
@ -540,6 +621,32 @@ def test_language_source_and_vectors(nlp2):
assert nlp.vocab.vectors.to_bytes() == vectors_bytes
@pytest.mark.parametrize("n_process", [1, 2])
def test_pass_doc_to_pipeline(nlp, n_process):
texts = ["cats", "dogs", "guinea pigs"]
docs = [nlp.make_doc(text) for text in texts]
assert not any(len(doc.cats) for doc in docs)
doc = nlp(docs[0])
assert doc.text == texts[0]
assert len(doc.cats) > 0
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
docs = nlp.pipe(docs, n_process=n_process)
assert [doc.text for doc in docs] == texts
assert all(len(doc.cats) for doc in docs)
def test_invalid_arg_to_pipeline(nlp):
str_list = ["This is a text.", "This is another."]
with pytest.raises(ValueError):
nlp(str_list) # type: ignore
assert len(list(nlp.pipe(str_list))) == 2
int_list = [1, 2, 3]
with pytest.raises(ValueError):
list(nlp.pipe(int_list)) # type: ignore
with pytest.raises(ValueError):
nlp(int_list) # type: ignore
@pytest.mark.skipif(
not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
)

Some files were not shown because too many files have changed in this diff.