Merge pull request #9612 from adrianeboyd/chore/switch-to-master-v3.2.0

Switch v3.2.0 to master
Adriane Boyd committed on 2021-11-03 16:27:34 +01:00 via GitHub, commit 2bf52c44b1
148 changed files with 3381 additions and 1690 deletions


@@ -65,8 +65,11 @@ steps:
     condition: eq(${{ parameters.gpu }}, true)
   - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
+      #python -m spacy download ca_core_news_sm
+      #python -m spacy download ca_core_news_md
+      # temporarily install the v3.1.0 models
+      pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl
+      pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl
       python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
@@ -95,7 +98,8 @@ steps:
   - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      # temporarily ignore W095
+      PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
     displayName: 'Test assemble CLI'
     condition: eq(variables['python_version'], '3.8')
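For context, the PYTHONWARNINGS value above uses Python's standard warning-filter syntax (action:message:category entries, comma-separated, applied left to right). A rough in-process equivalent of the new filter, shown only as a sketch of the semantics and not anything from this PR, would be:

import warnings

# Turn every warning into an error ...
warnings.filterwarnings("error")
# ... except the temporary [W095] UserWarning raised when the v3.1.0 Catalan
# models are loaded under spaCy v3.2.0 (the message pattern is matched against
# the start of the warning text), and DeprecationWarning in general.
warnings.filterwarnings("ignore", message=r"\[W095\]", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)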

.github/contributors/avi197.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Son Pham |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 09/10/2021 |
| GitHub username | Avi197 |
| Website (optional) | |

.github/contributors/fgaim.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Fitsum Gaim |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021-08-07 |
| GitHub username | fgaim |
| Website (optional) | |

.github/contributors/syrull.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Dimitar Ganev |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021/8/2 |
| GitHub username | syrull |
| Website (optional) | |

.gitignore (vendored, 1 line added)

@@ -9,6 +9,7 @@ keys/
 spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt
+spacy/tests/universe/universe.json
 
 # Website
 website/.cache/


@@ -1,5 +1,6 @@
 # Our libraries
 spacy-legacy>=3.0.8,<3.1.0
+spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.12,<8.1.0
@@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
 jinja2
+langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0


@@ -42,6 +42,7 @@ setup_requires =
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.8,<3.1.0
+    spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
@@ -62,6 +63,7 @@ install_requires =
     setuptools
     packaging>=20.0
     typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
+    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
 console_scripts =
@@ -69,9 +71,9 @@ console_scripts =
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.2,<1.1.0
+    spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
-    spacy_transformers>=1.0.1,<1.2.0
+    spacy_transformers>=1.1.2,<1.2.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =


@@ -81,6 +81,7 @@ COPY_FILES = {
     ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
     ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
     ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
+    ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
 }


@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.1.4"
+__version__ = "3.2.0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"


@@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
-            if strings_map is not None and isinstance(value, basestring):
+            if strings_map is not None and isinstance(value, str):
                 if hasattr(strings_map, 'add'):
                     value = strings_map.add(value)
                 else:
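This hunk only replaces the Python 2 basestring check with str. As a quick illustration of what intify_attrs does with string values when a string store is passed (the attribute values below are made up):

# Sketch: intify_attrs maps attribute names to their integer IDs and interns
# string values via the supplied string store. Example values are invented.
from spacy.attrs import intify_attrs
from spacy.strings import StringStore

strings = StringStore()
attrs = intify_attrs({"LEMMA": "run", "POS": "VERB"}, strings_map=strings)
print(attrs)  # keys are integer attribute IDs, values are string-store hashes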


@@ -20,6 +20,7 @@ def init_vectors_cli(
     output_dir: Path = Arg(..., help="Pipeline output directory"),
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
@@ -34,7 +35,14 @@ def init_vectors_cli(
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
         update_lexemes(nlp, jsonl_loc)
-    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    convert_vectors(
+        nlp,
+        vectors_loc,
+        truncate=truncate,
+        prune=prune,
+        name=name,
+        mode=mode,
+    )
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
     msg.good(
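The new --mode option is forwarded straight to convert_vectors, so the same conversion can be scripted directly. A minimal sketch, assuming convert_vectors is importable from spacy.training.initialize (the import path, file paths and vectors name below are assumptions; the keyword arguments mirror the call above):

# Sketch of the programmatic equivalent of
#   python -m spacy init vectors en vectors.floret.gz ./output_dir --mode floret
from pathlib import Path

import spacy
from spacy.training.initialize import convert_vectors  # assumed import path

nlp = spacy.blank("en")
convert_vectors(
    nlp,
    Path("vectors.floret.gz"),  # hypothetical floret-exported vectors file
    truncate=0,
    prune=-1,
    name="en_demo.vectors",     # hypothetical vectors name
    mode="floret",
)
nlp.to_disk("./output_dir")     # hypothetical output directory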


@@ -5,6 +5,7 @@ raw_text = null
 max_epochs = 1000
 dropout = 0.2
 n_save_every = null
+n_save_epoch = null
 component = "tok2vec"
 layer = ""
 corpus = "corpora.pretrain"


@@ -22,6 +22,9 @@ def setup_default_warnings():
     # warn once about lemmatizer without required POS
     filter_warning("once", error_msg=Warnings.W108)
 
+    # floret vector table cannot be modified
+    filter_warning("once", error_msg="[W114]")
+
 
 def filter_warning(action: str, error_msg: str):
     """Customize how spaCy should handle a certain warning.
@@ -186,6 +189,8 @@ class Warnings(metaclass=ErrorsWithCodes):
             "vectors are not identical to current pipeline vectors.")
     W114 = ("Using multiprocessing with GPU models is not recommended and may "
             "lead to errors.")
+    W115 = ("Skipping {method}: the floret vector table cannot be modified. "
+            "Vectors are calculated from character ngrams.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -277,7 +282,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "you forget to call the `set_extension` method?")
    E047 = ("Can't assign a value to unregistered extension attribute "
            "'{name}'. Did you forget to call the `set_extension` method?")
-   E048 = ("Can't import language {lang} from spacy.lang: {err}")
+   E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
    E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
            "package or a valid path to a data directory.")
    E052 = ("Can't find model directory: {path}")
@@ -511,13 +516,24 @@ class Errors(metaclass=ErrorsWithCodes):
    E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
    E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
            "issue tracker: http://github.com/explosion/spaCy/issues")
-   E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
+   E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
 
    # New errors added in v3.x
-   E866 = ("A SpanGroup is not functional after the corresponding Doc has "
+   E858 = ("The {mode} vector table does not support this operation. "
+           "{alternative}")
+   E859 = ("The floret vector table cannot be modified.")
+   E860 = ("Can't truncate fasttext-bloom vectors.")
+   E861 = ("No 'keys' should be provided when initializing floret vectors "
+           "with 'minn' and 'maxn'.")
+   E862 = ("'hash_count' must be between 1-4 for floret vectors.")
+   E863 = ("'maxn' must be greater than or equal to 'minn'.")
+   E864 = ("The complete vector table 'data' is required to initialize floret "
+           "vectors.")
+   E865 = ("A SpanGroup is not functional after the corresponding Doc has "
            "been garbage collected. To keep using the spans, make sure that "
            "the corresponding Doc object is still available in the scope of "
            "your function.")
+   E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
    E867 = ("The 'textcat' component requires at least two labels because it "
            "uses mutually exclusive classes where exactly one label is True "
            "for each doc. For binary classification tasks, you can use two "


@@ -124,7 +124,7 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
 
-    def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
+    def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
@@ -185,15 +185,15 @@ cdef class KnowledgeBase:
             i += 1
 
-    def contains_entity(self, unicode entity):
+    def contains_entity(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings.add(entity)
         return entity_hash in self._entry_index
 
-    def contains_alias(self, unicode alias):
+    def contains_alias(self, str alias):
         cdef hash_t alias_hash = self.vocab.strings.add(alias)
         return alias_hash in self._alias_index
 
-    def add_alias(self, unicode alias, entities, probabilities):
+    def add_alias(self, str alias, entities, probabilities):
         """
         For a given alias, add its potential entities and prior probabilities to the KB.
         Return the alias_hash at the end
@@ -239,7 +239,7 @@ cdef class KnowledgeBase:
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash
 
-    def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
+    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
         """
         For an alias already existing in the KB, extend its potential entities with one more.
         Throw a warning if either the alias or the entity is unknown,
@@ -286,7 +286,7 @@ cdef class KnowledgeBase:
         alias_entry.probs = probs
         self._aliases_table[alias_index] = alias_entry
 
-    def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
+    def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
@@ -307,7 +307,7 @@ cdef class KnowledgeBase:
                 for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
 
-    def get_vector(self, unicode entity):
+    def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
 
         # Return an empty list if this entity is unknown in this KB
@@ -317,7 +317,7 @@ cdef class KnowledgeBase:
         return self._vectors_table[self._entries[entry_index].vector_index]
 
-    def get_prior_prob(self, unicode entity, unicode alias):
+    def get_prior_prob(self, str entity, str alias):
         """ Return the prior probability of a given alias being linked to a given entity,
         or return 0.0 when this combination is not known in the knowledge base"""
         cdef hash_t alias_hash = self.vocab.strings[alias]
@@ -587,7 +587,7 @@ cdef class Writer:
     def __init__(self, path):
         assert isinstance(path, Path)
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
             raise IOError(Errors.E146.format(path=path))
@@ -629,7 +629,7 @@ cdef class Writer:
 cdef class Reader:
     def __init__(self, path):
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)
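The kb.pyx hunks only swap the Python 2 unicode type for str in the method signatures; the public KnowledgeBase API is unchanged. A minimal usage sketch of the methods touched here (entity IDs, aliases, frequencies and vectors are invented):

# Sketch of the KnowledgeBase methods whose signatures changed above.
# All entity IDs, aliases, frequencies and vectors are illustrative only.
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

kb.add_entity(entity="Q42", freq=100, entity_vector=[0.1, 0.2, 0.3])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])

assert kb.contains_entity("Q42")
assert kb.contains_alias("Douglas")
print(kb.get_prior_prob("Q42", "Douglas"))                    # 0.9
print([c.entity_ for c in kb.get_alias_candidates("Douglas")])  # ["Q42"]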


@@ -1,7 +1,7 @@
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
 from ..char_classes import UNITS, ALPHA_UPPER
 
-_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
+_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
 
 _suffixes = (
     _list_punct


@@ -1,265 +1,79 @@ (the old one-word-per-line list from https://github.com/Alir3z4/stop-words is rewritten; the new version follows)
"""
References:
    https://github.com/Alir3z4/stop-words - Original list, serves as a base.
    https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
"""

STOP_WORDS = set(
    """
а автентичен аз ако ала

бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
бъде бъда бяха

в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все
всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху
вътре веднъж

г ги главен главна главно глас го годно година години годишен

д да дали далеч далече два двама двамата две двете ден днес дни до добра добре
добро добър достатъчно докато докога дори досега доста друг друга другаде други

е евтин едва един една еднаква еднакви еднакъв едно екип ето

живот жив

за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал
затова запазва започвам защо защото завинаги

и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки
извън имайки

й йо

каза казва казвайки казвам как каква какво както какъв като кога кауза каузи
когато когото което които кой който колко която къде където към край кратък
кръгъл

лесен лесно ли летя летиш летим лош

м май малко макар малцина междувременно минус ме между мек мен месец ми мис
мисля много мнозина мога могат може мой можем мокър моля момента му

н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина
например наопаки наполовина напоследък нека независимо нас насам наскоро
настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова
нови новина някои някой някога някъде няколко няма

о обаче около описан опитах опитва опитвайки опитвам определен определено освен
обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу
отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам
относно още

п пак по повече повечето под поне просто пряко поради после последен последно
посочен почти прави прав прави правя пред преди през при пък първата първи първо
път пъти плюс

равен равна различен различни разумен разумно

с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след
следващ следващия следва следното следователно случва сме смях собствен
сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам
съответен съответно сте съм със също

т така техен техни такива такъв твърде там трета твой те тези ти то това
тогава този той търси толкова точно три трябва тук тъй тя тях

у утре ужасно употреба успоредно уточнен уточняване

харесва харесали хиляди

ч часа ценя цяло цялостен че често чрез чудя

ще щеше щом щяха

юмрук

я як
    """.split()
)


@@ -1,10 +1,16 @@ (new version shown)
"""
References:
    https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations
    (countries, occupations, fields of studies and more).
"""
from ...symbols import ORTH, NORM


_exc = {}

# measurements
for abbr in [
    {ORTH: "м", NORM: "метър"},
    {ORTH: "мм", NORM: "милиметър"},
    {ORTH: "см", NORM: "сантиметър"},
@@ -17,51 +23,191 @@ (new version shown)
    {ORTH: "хл", NORM: "хектолиър"},
    {ORTH: "дкл", NORM: "декалитър"},
    {ORTH: "л", NORM: "литър"},
]:
    _exc[abbr[ORTH]] = [abbr]

# line abbreviations
for abbr in [
    {ORTH: "г-жа", NORM: "госпожа"},
    {ORTH: "г", NORM: "господин"},
    {ORTH: "г-ца", NORM: "госпожица"},
    {ORTH: "д-р", NORM: "доктор"},
    {ORTH: "о", NORM: "остров"},
    {ORTH: "п-в", NORM: "полуостров"},
    {ORTH: "с-у", NORM: "срещу"},
    {ORTH: "в-у", NORM: "върху"},
    {ORTH: "м-у", NORM: "между"},
]:
    _exc[abbr[ORTH]] = [abbr]

# foreign language related abbreviations
for abbr in [
    {ORTH: "англ.", NORM: "английски"},
    {ORTH: "ан.", NORM: "английски термин"},
    {ORTH: "араб.", NORM: "арабски"},
    {ORTH: "афр.", NORM: "африкански"},
    {ORTH: "гр.", NORM: "гръцки"},
    {ORTH: "лат.", NORM: "латински"},
    {ORTH: "рим.", NORM: "римски"},
    {ORTH: "старогр.", NORM: "старогръцки"},
    {ORTH: "староевр.", NORM: "староеврейски"},
    {ORTH: "фр.", NORM: "френски"},
    {ORTH: "хол.", NORM: "холандски"},
    {ORTH: "швед.", NORM: "шведски"},
    {ORTH: "шотл.", NORM: "шотландски"},
    {ORTH: "яп.", NORM: "японски"},
]:
    _exc[abbr[ORTH]] = [abbr]

# profession and academic titles abbreviations
for abbr in [
    {ORTH: "акад.", NORM: "академик"},
    {ORTH: "арх.", NORM: "архитект"},
    {ORTH: "инж.", NORM: "инженер"},
    {ORTH: "канц.", NORM: "канцлер"},
    {ORTH: "проф.", NORM: "професор"},
    {ORTH: "св.", NORM: "свети"},
]:
    _exc[abbr[ORTH]] = [abbr]

# fields of studies
for abbr in [
    {ORTH: "агр.", NORM: "агрономия"},
    {ORTH: "ав.", NORM: "авиация"},
    {ORTH: "агр.", NORM: "агрономия"},
    {ORTH: "археол.", NORM: "археология"},
    {ORTH: "астр.", NORM: "астрономия"},
    {ORTH: "геод.", NORM: "геодезия"},
    {ORTH: "геол.", NORM: "геология"},
    {ORTH: "геом.", NORM: "геометрия"},
    {ORTH: "гимн.", NORM: "гимнастика"},
    {ORTH: "грам.", NORM: "граматика"},
    {ORTH: "жур.", NORM: "журналистика"},
    {ORTH: "журн.", NORM: "журналистика"},
    {ORTH: "зем.", NORM: "земеделие"},
    {ORTH: "икон.", NORM: "икономика"},
    {ORTH: "лит.", NORM: "литература"},
    {ORTH: "мат.", NORM: "математика"},
    {ORTH: "мед.", NORM: "медицина"},
    {ORTH: "муз.", NORM: "музика"},
    {ORTH: "печ.", NORM: "печатарство"},
    {ORTH: "пол.", NORM: "политика"},
    {ORTH: "псих.", NORM: "психология"},
    {ORTH: "соц.", NORM: "социология"},
    {ORTH: "стат.", NORM: "статистика"},
    {ORTH: "стил.", NORM: "стилистика"},
    {ORTH: "топогр.", NORM: "топография"},
    {ORTH: "търг.", NORM: "търговия"},
    {ORTH: "фарм.", NORM: "фармацевтика"},
    {ORTH: "фехт.", NORM: "фехтовка"},
    {ORTH: "физиол.", NORM: "физиология"},
    {ORTH: "физ.", NORM: "физика"},
    {ORTH: "фил.", NORM: "философия"},
    {ORTH: "фин.", NORM: "финанси"},
    {ORTH: "фолкл.", NORM: "фолклор"},
    {ORTH: "фон.", NORM: "фонетика"},
    {ORTH: "фот.", NORM: "фотография"},
    {ORTH: "футб.", NORM: "футбол"},
    {ORTH: "хим.", NORM: "химия"},
    {ORTH: "хир.", NORM: "хирургия"},
    {ORTH: "ел.", NORM: "електротехника"},
]:
    _exc[abbr[ORTH]] = [abbr]

for abbr in [
    {ORTH: "ал.", NORM: "алинея"},
    {ORTH: "авт.", NORM: "автоматично"},
    {ORTH: "адм.", NORM: "администрация"},
    {ORTH: "арт.", NORM: "артилерия"},
    {ORTH: "бл.", NORM: "блок"},
    {ORTH: "бр.", NORM: "брой"},
    {ORTH: "бул.", NORM: "булевард"},
    {ORTH: "букв.", NORM: "буквално"},
    {ORTH: "в.", NORM: "век"},
    {ORTH: "вр.", NORM: "време"},
    {ORTH: "вм.", NORM: "вместо"},
    {ORTH: "воен.", NORM: "военен термин"},
    {ORTH: "г.", NORM: "година"},
    {ORTH: "гр.", NORM: "град"},
    {ORTH: "гл.", NORM: "глагол"},
    {ORTH: "др.", NORM: "други"},
    {ORTH: "ез.", NORM: "езеро"},
    {ORTH: "ж.р.", NORM: "женски род"},
    {ORTH: "жп.", NORM: "железопът"},
    {ORTH: "застр.", NORM: "застрахователно дело"},
    {ORTH: "знач.", NORM: "значение"},
    {ORTH: "и др.", NORM: "и други"},
    {ORTH: "и под.", NORM: "и подобни"},
    {ORTH: "и пр.", NORM: "и прочие"},
    {ORTH: "изр.", NORM: "изречение"},
    {ORTH: "изт.", NORM: "източен"},
    {ORTH: "конкр.", NORM: "конкретно"},
    {ORTH: "лв.", NORM: "лев"},
    {ORTH: "л.", NORM: "лице"},
    {ORTH: "м.р.", NORM: "мъжки род"},
    {ORTH: "мин.вр.", NORM: "минало време"},
    {ORTH: "мн.ч.", NORM: "множествено число"},
    {ORTH: "напр.", NORM: "например"},
    {ORTH: "нар.", NORM: "наречие"},
    {ORTH: "науч.", NORM: "научен термин"},
    {ORTH: "непр.", NORM: "неправилно"},
    {ORTH: "обик.", NORM: "обикновено"},
    {ORTH: "опред.", NORM: "определение"},
    {ORTH: "особ.", NORM: "особено"},
    {ORTH: "ост.", NORM: "остаряло"},
    {ORTH: "относ.", NORM: "относително"},
    {ORTH: "отр.", NORM: "отрицателно"},
    {ORTH: "пл.", NORM: "площад"},
    {ORTH: "пад.", NORM: "падеж"},
    {ORTH: "парл.", NORM: "парламентарен"},
    {ORTH: "погов.", NORM: "поговорка"},
    {ORTH: "пон.", NORM: "понякога"},
    {ORTH: "правосл.", NORM: "православен"},
    {ORTH: "прибл.", NORM: "приблизително"},
    {ORTH: "прил.", NORM: "прилагателно име"},
    {ORTH: "пр.", NORM: "прочие"},
    {ORTH: "с.", NORM: "село"},
    {ORTH: "с.р.", NORM: "среден род"},
    {ORTH: "сп.", NORM: "списание"},
    {ORTH: "стр.", NORM: "страница"},
    {ORTH: "сз.", NORM: "съюз"},
    {ORTH: "сег.", NORM: "сегашно"},
    {ORTH: "сп.", NORM: "спорт"},
    {ORTH: "срв.", NORM: "сравни"},
    {ORTH: "с.ст.", NORM: "селскостопанска техника"},
    {ORTH: "счет.", NORM: "счетоводство"},
    {ORTH: "съкр.", NORM: "съкратено"},
    {ORTH: "съобщ.", NORM: "съобщение"},
    {ORTH: "същ.", NORM: "съществително"},
    {ORTH: "текст.", NORM: "текстилен"},
    {ORTH: "телев.", NORM: "телевизия"},
    {ORTH: "тел.", NORM: "телефон"},
    {ORTH: "т.е.", NORM: "тоест"},
    {ORTH: "т.н.", NORM: "така нататък"},
    {ORTH: "т.нар.", NORM: "така наречен"},
    {ORTH: "търж.", NORM: "тържествено"},
    {ORTH: "ул.", NORM: "улица"},
    {ORTH: "уч.", NORM: "училище"},
    {ORTH: "унив.", NORM: "университет"},
    {ORTH: "харт.", NORM: "хартия"},
    {ORTH: "хидр.", NORM: "хидравлика"},
    {ORTH: "хран.", NORM: "хранителна"},
    {ORTH: "църк.", NORM: "църковен термин"},
    {ORTH: "числ.", NORM: "числително"},
    {ORTH: "чл.", NORM: "член"},
    {ORTH: "ч.", NORM: "число"},
    {ORTH: "числ.", NORM: "числително"},
    {ORTH: "шахм.", NORM: "шахмат"},
    {ORTH: "шах.", NORM: "шахмат"},
    {ORTH: "юр.", NORM: "юридически"},
]:
    _exc[abbr[ORTH]] = [abbr]

# slash abbreviations
for abbr in [
    {ORTH: "м/у", NORM: "между"},
    {ORTH: "с/у", NORM: "срещу"},
]:
    _exc[abbr[ORTH]] = [abbr]

TOKENIZER_EXCEPTIONS = _exc
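A quick way to sanity-check exceptions like these is to push text through a blank pipeline. The snippet below is a sketch and assumes a spaCy installation that already ships this Bulgarian data:

# Sketch: the new exceptions keep abbreviations such as "м/у" as single tokens
# and attach the expanded NORM. Example sentence is invented.
import spacy

nlp = spacy.blank("bg")
doc = nlp("инж. Иванов живее м/у гр. София и с. Бистрица")
print([(t.text, t.norm_) for t in doc])
# e.g. ("м/у", "между"), ("гр.", "град"), ("с.", "село")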


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Bengali"]

spacy/lang/ca/__init__.py (file mode: Normal → Executable, 23 lines changed)

@@ -1,9 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -15,6 +15,7 @@ class CatalanDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
@@ -28,13 +29,25 @@ class Catalan(Language):
 @Catalan.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return CatalanLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Catalan"]

spacy/lang/ca/punctuation.py (file mode: Normal → Executable, 11 lines changed)

@@ -1,4 +1,5 @@
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import LIST_CURRENCY
 from ..char_classes import CURRENCY
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
 
 ELISION = " ' ".strip().replace(" ", "").replace("\n", "")
 
+_prefixes = (
+    ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
 
 _infixes = (
     LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+        r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
     ]
 )
 
@@ -44,3 +54,4 @@ _suffixes = (
 
 TOKENIZER_INFIXES = _infixes
 TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_PREFIXES = _prefixes

spacy/lang/ca/tokenizer_exceptions.py (file mode: Normal → Executable, 21 lines changed)

@@ -18,12 +18,21 @@ for exc_data in [
     {ORTH: "nov.", NORM: "novembre"},
     {ORTH: "dec.", NORM: "desembre"},
     {ORTH: "Dr.", NORM: "doctor"},
+    {ORTH: "Dra.", NORM: "doctora"},
     {ORTH: "Sr.", NORM: "senyor"},
     {ORTH: "Sra.", NORM: "senyora"},
     {ORTH: "Srta.", NORM: "senyoreta"},
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "pl.", NORM: "plaça"},
+    {ORTH: "à."},
+    {ORTH: "è."},
+    {ORTH: "é."},
+    {ORTH: "í."},
+    {ORTH: "ò."},
+    {ORTH: "ó."},
+    {ORTH: "ú."},
     {ORTH: "'l"},
     {ORTH: "'ls"},
     {ORTH: "'m"},
@@ -34,6 +43,18 @@ for exc_data in [
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
+_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
+_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
+
+_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
+_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
+
+_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
+_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
+
+_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
+
 # Times
 _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -28,13 +28,25 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return GreekLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Greek"]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -26,13 +26,25 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return EnglishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["English"]


@@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.
 
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        univ_pos (str / int): The token's universal part-of-speech tag.
         morphology (dict): The token's morphological features following the
             Universal Dependencies scheme.
         """


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -26,13 +26,25 @@ class Spanish(Language):
 @Spanish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return SpanishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Spanish"]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Persian"]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -31,13 +31,25 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return FrenchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["French"]

View File

@ -1,6 +1,11 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from .lemmatizer import IrishLemmatizer
class IrishDefaults(BaseDefaults): class IrishDefaults(BaseDefaults):
@ -13,4 +18,16 @@ class Irish(Language):
Defaults = IrishDefaults Defaults = IrishDefaults
@Irish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Irish"] __all__ = ["Irish"]

View File

@ -1,35 +0,0 @@
# fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
slender_vowels = ["e", "é", "i", "í"]
vowels = broad_vowels + slender_vowels
# fmt: on
def ends_dentals(word):
if word != "" and word[-1] in ["d", "n", "t", "s"]:
return True
else:
return False
def devoice(word):
if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
return word[:-1] + "t"
else:
return word
def ends_with_vowel(word):
return word != "" and word[-1] in vowels
def starts_with_vowel(word):
return word != "" and word[0] in vowels
def deduplicate(word):
if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
return word[:-1]
else:
return word

162
spacy/lang/ga/lemmatizer.py Normal file
View File

@ -0,0 +1,162 @@
from typing import List, Dict, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class IrishLemmatizer(Lemmatizer):
# This is a lookup-based lemmatiser using data extracted from
# BuNaMo (https://github.com/michmech/BuNaMo)
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "pos_lookup":
# fmt: off
required = [
"lemma_lookup_adj", "lemma_lookup_adp",
"lemma_lookup_noun", "lemma_lookup_verb"
]
# fmt: on
return (required, [])
else:
return super().get_lookups_config(mode)
def pos_lookup_lemmatize(self, token: Token) -> List[str]:
univ_pos = token.pos_
string = unponc(token.text)
if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
return [string.lower()]
demutated = demutate(string)
secondary = ""
if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
secondary = string[1:]
lookup_pos = univ_pos.lower()
if univ_pos == "PROPN":
lookup_pos = "noun"
if token.has_morph():
# TODO: lookup is actually required for the genitive forms, but
# this is not in BuNaMo, and would not be of use with IDT.
if univ_pos == "NOUN" and (
"VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
):
hpref = "Form=HPref" in token.morph
return [demutate(string, hpref).lower()]
elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
return [demutate(string).lower()]
lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
def to_list(value):
if value is None:
value = []
elif not isinstance(value, list):
value = [value]
return value
if univ_pos == "ADP":
return to_list(lookup_table.get(string, string.lower()))
ret = []
if univ_pos == "PROPN":
ret.extend(to_list(lookup_table.get(demutated)))
ret.extend(to_list(lookup_table.get(secondary)))
else:
ret.extend(to_list(lookup_table.get(demutated.lower())))
ret.extend(to_list(lookup_table.get(secondary.lower())))
if len(ret) == 0:
ret = [string.lower()]
return ret
def demutate(word: str, is_hpref: bool = False) -> str:
UVOWELS = "AÁEÉIÍOÓUÚ"
LVOWELS = "aáeéiíoóuú"
lc = word.lower()
# remove eclipsis
if lc.startswith("bhf"):
word = word[2:]
elif lc.startswith("mb"):
word = word[1:]
elif lc.startswith("gc"):
word = word[1:]
elif lc.startswith("nd"):
word = word[1:]
elif lc.startswith("ng"):
word = word[1:]
elif lc.startswith("bp"):
word = word[1:]
elif lc.startswith("dt"):
word = word[1:]
elif word[0:1] == "n" and word[1:2] in UVOWELS:
word = word[1:]
elif lc.startswith("n-") and word[2:3] in LVOWELS:
word = word[2:]
# non-standard eclipsis
elif lc.startswith("bh-f"):
word = word[3:]
elif lc.startswith("m-b"):
word = word[2:]
elif lc.startswith("g-c"):
word = word[2:]
elif lc.startswith("n-d"):
word = word[2:]
elif lc.startswith("n-g"):
word = word[2:]
elif lc.startswith("b-p"):
word = word[2:]
elif lc.startswith("d-t"):
word = word[2:]
# t-prothesis
elif lc.startswith("ts"):
word = word[1:]
elif lc.startswith("t-s"):
word = word[2:]
# h-prothesis, if known to be present
elif is_hpref and word[0:1] == "h":
word = word[1:]
# h-prothesis, simple case
# words can also begin with 'h', but unlike eclipsis,
# a hyphen is not used, so that needs to be handled
# elsewhere
elif word[0:1] == "h" and word[1:2] in UVOWELS:
word = word[1:]
# lenition
# this breaks the previous if, to handle super-non-standard
# text where both eclipsis and lenition were used.
if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
word = word[0:1] + word[2:]
return word
def unponc(word: str) -> str:
# fmt: off
PONC = {
"": "bh",
"ċ": "ch",
"": "dh",
"": "fh",
"ġ": "gh",
"": "mh",
"": "ph",
"": "sh",
"": "th",
"": "BH",
"Ċ": "CH",
"": "DH",
"": "FH",
"Ġ": "GH",
"": "MH",
"": "PH",
"": "SH",
"": "TH"
}
# fmt: on
buf = []
for ch in word:
if ch in PONC:
buf.append(PONC[ch])
else:
buf.append(ch)
return "".join(buf)

View File

@ -9,6 +9,8 @@ _exc = {
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}], "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}], "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}], "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
"théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
"tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
} }
for exc_data in [ for exc_data in [
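The two new contractions split "théis"/"tréis" with the norm "tar éis". A minimal check, assuming a blank Irish pipeline is enough to exercise the tokenizer exceptions:

import spacy

nlp = spacy.blank("ga")
print([(t.text, t.norm_) for t in nlp("théis")])
# expected: [("th", "tar"), ("éis", "éis")]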

View File

@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
) )
for u in "cfkCFK":
_exc[f"°{u}"] = [{ORTH: f"°{u}"}]
_exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -23,13 +23,25 @@ class Italian(Language):
@Italian.factory( @Italian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return ItalianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Italian"] __all__ = ["Italian"]

View File

@ -1,21 +1,25 @@
from typing import Optional, Union, Dict, Any from typing import Optional, Union, Dict, Any, Callable
from pathlib import Path from pathlib import Path
import srsly import srsly
from collections import namedtuple from collections import namedtuple
from thinc.api import Model
import re
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP from .tag_bigram_map import TAG_BIGRAM_MAP
from ...compat import copy_reg
from ...errors import Errors from ...errors import Errors
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...pipeline import Morphologizer
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...tokens import Doc from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from ... import util from ... import util
@ -31,16 +35,21 @@ split_mode = null
@registry.tokenizers("spacy.ja.JapaneseTokenizer") @registry.tokenizers("spacy.ja.JapaneseTokenizer")
def create_tokenizer(split_mode: Optional[str] = None): def create_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp): def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode) return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
return japanese_tokenizer_factory return japanese_tokenizer_factory
class JapaneseTokenizer(DummyTokenizer): class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab self.vocab = vocab
self.split_mode = split_mode self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode) self.tokenizer = try_sudachi_import(self.split_mode)
# if we're using split mode A we don't need subtokens
self.need_subtokens = not (split_mode is None or split_mode == "A")
def __reduce__(self):
return JapaneseTokenizer, (self.vocab, self.split_mode)
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
@ -49,8 +58,8 @@ class JapaneseTokenizer(DummyTokenizer):
dtokens, spaces = get_dtokens_and_spaces(dtokens, text) dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules # create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = ( words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6 zip(*dtokens) if dtokens else [[]] * 7
) )
sub_tokens_list = list(sub_tokens_list) sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces) doc = Doc(self.vocab, words=words, spaces=spaces)
@ -68,9 +77,18 @@ class JapaneseTokenizer(DummyTokenizer):
) )
# if there's no lemma info (it's an unk) just use the surface # if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections morph = {}
doc.user_data["reading_forms"] = readings if dtoken.inf:
doc.user_data["sub_tokens"] = sub_tokens_list # it's normal for this to be empty for non-inflecting types
morph["Inflection"] = dtoken.inf
token.norm_ = dtoken.norm
if dtoken.reading:
# punctuation is its own reading, but we don't want values like
# "=" here
morph["Reading"] = re.sub("[=|]", "_", dtoken.reading)
token.morph = MorphAnalysis(self.vocab, morph)
if self.need_subtokens:
doc.user_data["sub_tokens"] = sub_tokens_list
return doc return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True): def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
@ -81,9 +99,10 @@ class JapaneseTokenizer(DummyTokenizer):
DetailedToken( DetailedToken(
token.surface(), # orth token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf ";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms'] token.normalized_form(),
token.reading_form(),
sub_tokens_list[idx] sub_tokens_list[idx]
if sub_tokens_list if sub_tokens_list
else None, # user_data['sub_tokens'] else None, # user_data['sub_tokens']
@ -105,9 +124,8 @@ class JapaneseTokenizer(DummyTokenizer):
] ]
def _get_sub_tokens(self, sudachipy_tokens): def _get_sub_tokens(self, sudachipy_tokens):
if ( # do nothing for default split mode
self.split_mode is None or self.split_mode == "A" if not self.need_subtokens:
): # do nothing for default split mode
return None return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None) sub_tokens_list = [] # list of (list of list of DetailedToken | None)
@ -176,9 +194,33 @@ class Japanese(Language):
Defaults = JapaneseDefaults Defaults = JapaneseDefaults
@Japanese.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={
"model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": True,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
},
default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
overwrite: bool,
extend: bool,
scorer: Optional[Callable],
):
return Morphologizer(
nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer
)
# Hold the attributes we need with convenient names # Hold the attributes we need with convenient names
DetailedToken = namedtuple( DetailedToken = namedtuple(
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"] "DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
) )
@ -254,7 +296,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0: elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace() assert text.isspace()
text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)] text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
text_spaces = [False] text_spaces = [False]
return text_dtokens, text_spaces return text_dtokens, text_spaces
@ -271,7 +313,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# space token # space token
if word_start > 0: if word_start > 0:
w = text[text_pos : text_pos + word_start] w = text[text_pos : text_pos + word_start]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False) text_spaces.append(False)
text_pos += word_start text_pos += word_start
@ -287,16 +329,10 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# trailing space token # trailing space token
if text_pos < len(text): if text_pos < len(text):
w = text[text_pos:] w = text[text_pos:]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False) text_spaces.append(False)
return text_dtokens, text_spaces return text_dtokens, text_spaces
def pickle_japanese(instance):
return Japanese, tuple()
copy_reg.pickle(Japanese, pickle_japanese)
__all__ = ["Japanese"] __all__ = ["Japanese"]

View File

@ -5,11 +5,11 @@ from .tag_map import TAG_MAP
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...tokens import Doc from ...tokens import Doc
from ...compat import copy_reg
from ...scorer import Scorer from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.ko.KoreanTokenizer") @registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer(): def create_tokenizer():
def korean_tokenizer_factory(nlp): def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp) return KoreanTokenizer(nlp.vocab)
return korean_tokenizer_factory return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer): class KoreanTokenizer(DummyTokenizer):
def __init__(self, nlp: Language): def __init__(self, vocab: Vocab):
self.vocab = nlp.vocab self.vocab = vocab
MeCab = try_mecab_import() # type: ignore[func-returns-value] MeCab = try_mecab_import() # type: ignore[func-returns-value]
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
def __reduce__(self):
return KoreanTokenizer, (self.vocab,)
def __del__(self): def __del__(self):
self.mecab_tokenizer.__del__() self.mecab_tokenizer.__del__()
@ -106,10 +109,4 @@ def check_spaces(text, tokens):
yield False yield False
def pickle_korean(instance):
return Korean, tuple()
copy_reg.pickle(Korean, pickle_korean)
__all__ = ["Korean"] __all__ = ["Korean"]

View File

@ -3,6 +3,7 @@ import unicodedata
import re import re
from .. import attrs from .. import attrs
from .tokenizer_exceptions import URL_MATCH
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match _like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
return True return True
if tld.isalpha() and tld in _tlds: if tld.isalpha() and tld in _tlds:
return True return True
if URL_MATCH(text):
return True
return False return False
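like_url now falls back to the tokenizer's full URL_MATCH pattern after the prefix and TLD heuristics. It can be exercised directly; a small illustrative check, not an exhaustive one:

from spacy.lang.lex_attrs import like_url

print(like_url("www.example.com"))  # True via the existing prefix heuristic
print(like_url("notaurl"))          # False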

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .lemmatizer import MacedonianLemmatizer from .lemmatizer import MacedonianLemmatizer
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -38,13 +38,25 @@ class Macedonian(Language):
@Macedonian.factory( @Macedonian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return MacedonianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Macedonian"] __all__ = ["Macedonian"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@ -26,13 +26,25 @@ class Norwegian(Language):
@Norwegian.factory( @Norwegian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Norwegian"] __all__ = ["Norwegian"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
@ -30,13 +30,25 @@ class Dutch(Language):
@Dutch.factory( @Dutch.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return DutchLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Dutch"] __all__ = ["Dutch"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
@ -33,13 +33,25 @@ class Polish(Language):
@Polish.factory( @Polish.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return PolishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Polish"] __all__ = ["Polish"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -22,7 +22,12 @@ class Russian(Language):
@Russian.factory( @Russian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
@ -31,8 +36,11 @@ def make_lemmatizer(
name: str, name: str,
mode: str, mode: str,
overwrite: bool, overwrite: bool,
scorer: Optional[Callable],
): ):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return RussianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Russian"] __all__ = ["Russian"]

View File

@ -1,8 +1,9 @@
from typing import Optional, List, Dict, Tuple from typing import Optional, List, Dict, Tuple, Callable
from thinc.api import Model from thinc.api import Model
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...symbols import POS from ...symbols import POS
from ...tokens import Token from ...tokens import Token
from ...vocab import Vocab from ...vocab import Vocab
@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
*, *,
mode: str = "pymorphy2", mode: str = "pymorphy2",
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None: ) -> None:
if mode == "pymorphy2": if mode == "pymorphy2":
try: try:
@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
) from None ) from None
if getattr(self, "_morph", None) is None: if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer() self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
def pymorphy2_lemmatize(self, token: Token) -> List[str]: def pymorphy2_lemmatize(self, token: Token) -> List[str]:
string = token.text string = token.text

View File

@ -1,47 +1,195 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
අතර සහ
එචචර සමග
එපමණ සමඟ
එල අහ
එව ආහ
කට ඕහ
කද අන
අඳ
අප
අප
අය
ආය
ඌය
නම
පමණ
පමණ
චර
පමණ
බඳ
වන
අය
ලද අය
වග
බවට
බව
බව
නම
මහ
මහ
පමණ
පමණ
පමන
වන වන
තර
වත ඇත
වද
සමඟ වශය
යන
සඳහ
මග
ඉත
එම
අතර
සමග
බඳව
බඳ
බව
මහ
තට
වට
අන
නව
බඳ
නට
එහ
එහ
තවත
තව
සහ සහ
දක
බවත
බවද
මත
ඇත
ඇත
වඩ
වඩ
තර
ඉක
යල
ඉත
ටන
පටන
දක
වක
පව
වත
ඇය
මන
වත වත
පත
තව
ඉත
වහ
හන
එම
එමබල
නම
වන
කල
ඉඳ
අන
ඔන
උද
සඳහ
අරබය
එන
එබ
අන
පර
වට
නම
එනම
වස
පර
එහ
""".split() """.split()
) )

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -29,13 +29,25 @@ class Swedish(Language):
@Swedish.factory( @Swedish.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Swedish"] __all__ = ["Swedish"]

View File

@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...tokens import Doc from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.th.ThaiTokenizer") @registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer(): def create_thai_tokenizer():
def thai_tokenizer_factory(nlp): def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp) return ThaiTokenizer(nlp.vocab)
return thai_tokenizer_factory return thai_tokenizer_factory
class ThaiTokenizer(DummyTokenizer): class ThaiTokenizer(DummyTokenizer):
def __init__(self, nlp: Language) -> None: def __init__(self, vocab: Vocab) -> None:
try: try:
from pythainlp.tokenize import word_tokenize from pythainlp.tokenize import word_tokenize
except ImportError: except ImportError:
@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
"https://github.com/PyThaiNLP/pythainlp" "https://github.com/PyThaiNLP/pythainlp"
) from None ) from None
self.word_tokenize = word_tokenize self.word_tokenize = word_tokenize
self.vocab = nlp.vocab self.vocab = vocab
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
words = list(self.word_tokenize(text)) words = list(self.word_tokenize(text))
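These tokenizers (Thai here, and the Japanese, Korean, Vietnamese and Chinese ones elsewhere in this diff) now take a Vocab instead of the nlp object. For a custom tokenizer following the same pattern, a hedged sketch: the class and the name "whitespace_tokenizer.v1" are made up for illustration, and DummyTokenizer is an internal helper that only provides no-op serialization hooks.

from spacy.tokens import Doc
from spacy.util import DummyTokenizer, registry
from spacy.vocab import Vocab

class WhitespaceTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        # store the vocab directly rather than an nlp reference
        self.vocab = vocab

    def __call__(self, text: str) -> Doc:
        return Doc(self.vocab, words=text.split())

    def __reduce__(self):
        # makes the tokenizer picklable, mirroring the changes in this diff
        return WhitespaceTokenizer, (self.vocab,)

@registry.tokenizers("whitespace_tokenizer.v1")
def create_whitespace_tokenizer():
    def whitespace_tokenizer_factory(nlp):
        return WhitespaceTokenizer(nlp.vocab)
    return whitespace_tokenizer_factory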

View File

@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"ዜሮ", "ዜሮ",
"", "",
"ክልተ", "ክልተ",
"ሰለስተ", "ሰለስተ",
"ኣርባዕተ", "ኣርባዕተ",
@ -11,66 +11,37 @@ _num_words = [
"ሸውዓተ", "ሸውዓተ",
"ሽሞንተ", "ሽሞንተ",
"ትሽዓተ", "ትሽዓተ",
"ኣሰርተ", "ዓሰርተ",
"ኣሰርተ ሐደ",
"ኣሰርተ ክልተ",
"ኣሰርተ ሰለስተ",
"ኣሰርተ ኣርባዕተ",
"ኣሰርተ ሓሙሽተ",
"ኣሰርተ ሽድሽተ",
"ኣሰርተ ሸውዓተ",
"ኣሰርተ ሽሞንተ",
"ኣሰርተ ትሽዓተ",
"ዕስራ", "ዕስራ",
"ሰላሳ", "ሰላሳ",
"ኣርብዓ", "ኣርብዓ",
"ምሳ", "ሓምሳ",
"ስል", "ሱሳ",
"ሰብዓ", "ሰብዓ",
"ሰማንያ", "ሰማንያ",
"ስዓ", "ቴስዓ",
"ሚእቲ", "ሚእቲ",
"ሺሕ", "ሺሕ",
"ሚልዮን", "ሚልዮን",
"ቢልዮን", "ቢልዮን",
"ትሪልዮን", "ትሪልዮን",
"ኳድሪልዮን", "ኳድሪልዮን",
"ገጅልዮን", "ጋዚልዮን",
"ዝልዮን", "ዚልዮን"
] ]
# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
_ordinal_words = [ _ordinal_words = [
"ቀዳማይ", "ቀዳማይ",
"ካልኣይ", "ካልኣይ",
"ሳልሳይ", "ሳልሳይ",
"ራብ", "ራብ",
"ሓምሻይ", "ሓምሻይ",
"ሻድሻይ", "ሻድሻይ",
"ሻውዓይ", "ሻውዓይ",
"ሻምናይ", "ሻምናይ",
"ዘጠነኛ", "ታሽዓይ",
"አስረኛ", "ዓስራይ"
"ኣሰርተ አንደኛ",
"ኣሰርተ ሁለተኛ",
"ኣሰርተ ሶስተኛ",
"ኣሰርተ አራተኛ",
"ኣሰርተ አምስተኛ",
"ኣሰርተ ስድስተኛ",
"ኣሰርተ ሰባተኛ",
"ኣሰርተ ስምንተኛ",
"ኣሰርተ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ" "አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
"ሰማንያኛ",
"ዘጠናኛ",
"መቶኛ",
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ",
] ]
@ -92,7 +63,7 @@ def like_num(text):
# Check ordinal number # Check ordinal number
if text_lower in _ordinal_words: if text_lower in _ordinal_words:
return True return True
if text_lower.endswith(""): if text_lower.endswith(""):
if text_lower[:-2].isdigit(): if text_lower[:-2].isdigit():
return True return True

View File

@ -1,7 +1,7 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER from ..char_classes import UNITS, ALPHA_UPPER
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
_suffixes = ( _suffixes = (
_list_punct _list_punct

View File

@ -1,6 +1,27 @@
# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
# Stop words # Stop words
STOP_WORDS = set( STOP_WORDS = set(
""" """
ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም 'ምበር ' '' ''ውን '' '' 'ዮም 'ዮን
ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ
ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም
ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን
ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ
ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ
ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን
እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ
ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም
እየ እየን እዩ እያ እዮም
ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ
ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል
ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ
ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ ዘሎ ዘይብሉ
ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ
ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
""".split() """.split()
) )

View File

@ -250,3 +250,9 @@ o.0
for orth in emoticons: for orth in emoticons:
BASE_EXCEPTIONS[orth] = [{ORTH: orth}] BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
# Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds
for u in "cfkCFK":
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
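With the exceptions now defined at the base level, degree abbreviations are split consistently across languages that build on BASE_EXCEPTIONS. A quick check; the output is the expected behaviour, not verified here:

import spacy

nlp = spacy.blank("en")
print([t.text for t in nlp("°C.")])  # expected: ["°", "C", "."]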

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
@ -23,13 +23,25 @@ class Ukrainian(Language):
@Ukrainian.factory( @Ukrainian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return UkrainianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Ukrainian"] __all__ = ["Ukrainian"]

View File

@ -1,8 +1,9 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer from ..ru.lemmatizer import RussianLemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab from ...vocab import Vocab
@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
*, *,
mode: str = "pymorphy2", mode: str = "pymorphy2",
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None: ) -> None:
if mode == "pymorphy2": if mode == "pymorphy2":
try: try:
@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None ) from None
if getattr(self, "_morph", None) is None: if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk") self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

View File

@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...tokens import Doc from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from ... import util from ... import util
@ -24,14 +25,14 @@ use_pyvi = true
@registry.tokenizers("spacy.vi.VietnameseTokenizer") @registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True): def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp): def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
return vietnamese_tokenizer_factory return vietnamese_tokenizer_factory
class VietnameseTokenizer(DummyTokenizer): class VietnameseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, use_pyvi: bool = False): def __init__(self, vocab: Vocab, use_pyvi: bool = False):
self.vocab = nlp.vocab self.vocab = vocab
self.use_pyvi = use_pyvi self.use_pyvi = use_pyvi
if self.use_pyvi: if self.use_pyvi:
try: try:
@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
) )
raise ImportError(msg) from None raise ImportError(msg) from None
def __reduce__(self):
return VietnameseTokenizer, (self.vocab, self.use_pyvi)
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
if self.use_pyvi: if self.use_pyvi:
words = self.pyvi_tokenize(text) words = self.pyvi_tokenize(text)

18
spacy/lang/vi/examples.py Normal file
View File

@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.vi.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Đây là đâu, tôi là ai?",
"Căn phòng có nhiều cửa sổ nên nó khá sáng",
"Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.",
"Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.",
"Ông bạn đang ở đâu thế?",
"Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?",
"Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?",
"Làm việc nhiều chán quá, đi chơi đâu đi?",
]
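The new example sentences can be used as the docstring suggests; a short sketch, assuming pyvi is installed for the default use_pyvi = true setting:

import spacy
from spacy.lang.vi.examples import sentences

nlp = spacy.blank("vi")
for doc in nlp.pipe(sentences):
    print([t.text for t in doc][:6])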

View File

@ -9,11 +9,14 @@ _num_words = [
"bốn", "bốn",
"năm", "năm",
"sáu", "sáu",
"bảy",
"bẩy", "bẩy",
"tám", "tám",
"chín", "chín",
"mười", "mười",
"chục",
"trăm", "trăm",
"nghìn",
"tỷ", "tỷ",
] ]

View File

@ -11,6 +11,7 @@ from ...scorer import Scorer
from ...tokens import Doc from ...tokens import Doc
from ...training import validate_examples, Example from ...training import validate_examples, Example
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ... import util from ... import util
@ -48,14 +49,14 @@ class Segmenter(str, Enum):
@registry.tokenizers("spacy.zh.ChineseTokenizer") @registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
def chinese_tokenizer_factory(nlp): def chinese_tokenizer_factory(nlp):
return ChineseTokenizer(nlp, segmenter=segmenter) return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
return chinese_tokenizer_factory return chinese_tokenizer_factory
class ChineseTokenizer(DummyTokenizer): class ChineseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char): def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
self.vocab = nlp.vocab self.vocab = vocab
self.segmenter = ( self.segmenter = (
segmenter.value if isinstance(segmenter, Segmenter) else segmenter segmenter.value if isinstance(segmenter, Segmenter) else segmenter
) )

View File

@ -115,7 +115,7 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp` Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline. object and processing pipeline.
lang (str): Two-letter language ID, i.e. ISO code. lang (str): IETF language code, such as 'en'.
DOCS: https://spacy.io/api/language DOCS: https://spacy.io/api/language
""" """
@ -228,6 +228,7 @@ class Language:
"vectors": len(self.vocab.vectors), "vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys, "keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name, "name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode,
} }
self._meta["labels"] = dict(self.pipe_labels) self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but # TODO: Adding this back to prevent breaking people's code etc., but
@ -978,7 +979,7 @@ class Language:
def __call__( def __call__(
self, self,
text: str, text: Union[str, Doc],
*, *,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
@ -987,7 +988,9 @@ class Language:
and can contain arbitrary whitespace. Alignment into the original string and can contain arbitrary whitespace. Alignment into the original string
is preserved. is preserved.
text (str): The text to be processed. text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
the doc will be passed directly to the pipeline, skipping
`Language.make_doc`.
disable (List[str]): Names of the pipeline components to disable. disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, dict]): An optional dictionary with extra component_cfg (Dict[str, dict]): An optional dictionary with extra
keyword arguments for specific components. keyword arguments for specific components.
@ -995,7 +998,7 @@ class Language:
DOCS: https://spacy.io/api/language#call DOCS: https://spacy.io/api/language#call
""" """
doc = self.make_doc(text) doc = self._ensure_doc(text)
if component_cfg is None: if component_cfg is None:
component_cfg = {} component_cfg = {}
for name, proc in self.pipeline: for name, proc in self.pipeline:
@ -1080,6 +1083,20 @@ class Language:
) )
return self.tokenizer(text) return self.tokenizer(text)
def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
"""Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
if isinstance(doc_like, Doc):
return doc_like
if isinstance(doc_like, str):
return self.make_doc(doc_like)
raise ValueError(Errors.E866.format(type=type(doc_like)))
def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
"""Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
doc = self._ensure_doc(doc_like)
doc._context = context
return doc
def update( def update(
self, self,
examples: Iterable[Example], examples: Iterable[Example],
@ -1450,7 +1467,7 @@ class Language:
@overload @overload
def pipe( def pipe(
self, self,
texts: Iterable[str], texts: Iterable[Union[str, Doc]],
*, *,
as_tuples: Literal[False] = ..., as_tuples: Literal[False] = ...,
batch_size: Optional[int] = ..., batch_size: Optional[int] = ...,
@ -1463,7 +1480,7 @@ class Language:
@overload @overload
def pipe( # noqa: F811 def pipe( # noqa: F811
self, self,
texts: Iterable[Tuple[str, _AnyContext]], texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
*, *,
as_tuples: Literal[True] = ..., as_tuples: Literal[True] = ...,
batch_size: Optional[int] = ..., batch_size: Optional[int] = ...,
@ -1475,7 +1492,9 @@ class Language:
def pipe( # noqa: F811 def pipe( # noqa: F811
self, self,
texts: Union[Iterable[str], Iterable[Tuple[str, _AnyContext]]], texts: Union[
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
],
*, *,
as_tuples: bool = False, as_tuples: bool = False,
batch_size: Optional[int] = None, batch_size: Optional[int] = None,
@ -1485,7 +1504,8 @@ class Language:
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]: ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
"""Process texts as a stream, and yield `Doc` objects in order. """Process texts as a stream, and yield `Doc` objects in order.
texts (Iterable[str]): A sequence of texts to process. texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
process.
as_tuples (bool): If set to True, inputs should be a sequence of as_tuples (bool): If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of (text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False. (doc, context) tuples. Defaults to False.
@ -1500,23 +1520,24 @@ class Language:
""" """
# Handle texts with context as tuples # Handle texts with context as tuples
if as_tuples: if as_tuples:
texts = cast(Iterable[Tuple[str, _AnyContext]], texts) texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
text_context1, text_context2 = itertools.tee(texts) docs_with_contexts = (
texts = (tc[0] for tc in text_context1) self._ensure_doc_with_context(text, context) for text, context in texts
contexts = (tc[1] for tc in text_context2) )
docs = self.pipe( docs = self.pipe(
texts, docs_with_contexts,
batch_size=batch_size, batch_size=batch_size,
disable=disable, disable=disable,
n_process=n_process, n_process=n_process,
component_cfg=component_cfg, component_cfg=component_cfg,
) )
for doc, context in zip(docs, contexts): for doc in docs:
context = doc._context
doc._context = None
yield (doc, context) yield (doc, context)
return return
# At this point, we know that we're dealing with an iterable of plain texts texts = cast(Iterable[Union[str, Doc]], texts)
texts = cast(Iterable[str], texts)
# Set argument defaults # Set argument defaults
if n_process == -1: if n_process == -1:
@ -1551,7 +1572,7 @@ class Language:
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size) docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else: else:
# if n_process == 1, no processes are forked. # if n_process == 1, no processes are forked.
docs = (self.make_doc(text) for text in texts) docs = (self._ensure_doc(text) for text in texts)
for pipe in pipes: for pipe in pipes:
docs = pipe(docs) docs = pipe(docs)
for doc in docs: for doc in docs:
@ -1570,7 +1591,7 @@ class Language:
def _multiprocessing_pipe( def _multiprocessing_pipe(
self, self,
texts: Iterable[str], texts: Iterable[Union[str, Doc]],
pipes: Iterable[Callable[..., Iterator[Doc]]], pipes: Iterable[Callable[..., Iterator[Doc]]],
n_process: int, n_process: int,
batch_size: int, batch_size: int,
@ -1596,7 +1617,7 @@ class Language:
procs = [ procs = [
mp.Process( mp.Process(
target=_apply_pipes, target=_apply_pipes,
args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
) )
for rch, sch in zip(texts_q, bytedocs_send_ch) for rch, sch in zip(texts_q, bytedocs_send_ch)
] ]
@ -1609,11 +1630,12 @@ class Language:
recv.recv() for recv in cycle(bytedocs_recv_ch) recv.recv() for recv in cycle(bytedocs_recv_ch)
) )
try: try:
for i, (_, (byte_doc, byte_error)) in enumerate( for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1 zip(raw_texts, byte_tuples), 1
): ):
if byte_doc is not None: if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc) doc = Doc(self.vocab).from_bytes(byte_doc)
doc._context = byte_context
yield doc yield doc
elif byte_error is not None: elif byte_error is not None:
error = srsly.msgpack_loads(byte_error) error = srsly.msgpack_loads(byte_error)
@ -2138,7 +2160,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]:
def _apply_pipes( def _apply_pipes(
make_doc: Callable[[str], Doc], ensure_doc: Callable[[Union[str, Doc]], Doc],
pipes: Iterable[Callable[..., Iterator[Doc]]], pipes: Iterable[Callable[..., Iterator[Doc]]],
receiver, receiver,
sender, sender,
@ -2146,7 +2168,8 @@ def _apply_pipes(
) -> None: ) -> None:
"""Worker for Language.pipe """Worker for Language.pipe
make_doc (Callable[[str,] Doc]): Function to create Doc from text. ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
or raise an error if the input is neither a Doc nor a string.
pipes (Iterable[Pipe]): The components to apply. pipes (Iterable[Pipe]): The components to apply.
receiver (multiprocessing.Connection): Pipe to receive text. Usually receiver (multiprocessing.Connection): Pipe to receive text. Usually
created by `multiprocessing.Pipe()` created by `multiprocessing.Pipe()`
@ -2159,16 +2182,16 @@ def _apply_pipes(
while True: while True:
try: try:
texts = receiver.get() texts = receiver.get()
docs = (make_doc(text) for text in texts) docs = (ensure_doc(text) for text in texts)
for pipe in pipes: for pipe in pipes:
docs = pipe(docs) # type: ignore[arg-type, assignment] docs = pipe(docs) # type: ignore[arg-type, assignment]
# Connection does not accept unpickable objects, so send list. # Connection does not accept unpickable objects, so send list.
byte_docs = [(doc.to_bytes(), None) for doc in docs] byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
padding = [(None, None)] * (len(texts) - len(byte_docs)) padding = [(None, None, None)] * (len(texts) - len(byte_docs))
sender.send(byte_docs + padding) # type: ignore[operator] sender.send(byte_docs + padding) # type: ignore[operator]
except Exception: except Exception:
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))] error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
padding = [(None, None)] * (len(texts) - 1) padding = [(None, None, None)] * (len(texts) - 1)
sender.send(error_msg + padding) sender.send(error_msg + padding)
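Taken together, these changes let __call__ and pipe accept pre-constructed Doc objects, and they move as_tuples contexts onto Doc._context so the contexts survive multiprocessing. A minimal sketch of the new behaviour:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

# a pre-tokenized Doc is passed straight to the pipeline, skipping make_doc
doc = nlp(Doc(nlp.vocab, words=["This", "is", "pre-tokenized", "."]))

# (text, context) tuples may now also be (Doc, context) tuples
pairs = [("First text.", {"id": 1}),
         (Doc(nlp.vocab, words=["Second", "text", "."]), {"id": 2})]
for doc, context in nlp.pipe(pairs, as_tuples=True):
    print(context["id"], [t.text for t in doc])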

View File

@ -284,7 +284,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lower] return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x) self.c.lower = self.vocab.strings.add(x)
property norm_: property norm_:
@ -294,7 +294,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.norm] return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): def __set__(self, str x):
self.norm = self.vocab.strings.add(x) self.norm = self.vocab.strings.add(x)
property shape_: property shape_:
@ -304,7 +304,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.shape] return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x) self.c.shape = self.vocab.strings.add(x)
property prefix_: property prefix_:
@ -314,7 +314,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.prefix] return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x): def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x) self.c.prefix = self.vocab.strings.add(x)
property suffix_: property suffix_:
@ -324,7 +324,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.suffix] return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x) self.c.suffix = self.vocab.strings.add(x)
property lang_: property lang_:
@ -332,7 +332,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lang] return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x) self.c.lang = self.vocab.strings.add(x)
property flags: property flags:

View File

@ -148,9 +148,9 @@ cdef class DependencyMatcher:
Creates a token key to be used by the matcher Creates a token key to be used by the matcher
""" """
return self._normalize_key( return self._normalize_key(
unicode(key) + DELIMITER + str(key) + DELIMITER +
unicode(pattern_idx) + DELIMITER + str(pattern_idx) + DELIMITER +
unicode(token_idx) str(token_idx)
) )
def add(self, key, patterns, *, on_match=None): def add(self, key, patterns, *, on_match=None):
@ -424,7 +424,7 @@ cdef class DependencyMatcher:
return [doc[child.i] for child in doc[node].head.children if child.i < node] return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _normalize_key(self, key): def _normalize_key(self, key):
if isinstance(key, basestring): if isinstance(key, str):
return self.vocab.strings.add(key) return self.vocab.strings.add(key)
else: else:
return key return key

View File

@ -312,7 +312,7 @@ cdef class Matcher:
return final_results return final_results
def _normalize_key(self, key): def _normalize_key(self, key):
if isinstance(key, basestring): if isinstance(key, str):
return self.vocab.strings.add(key) return self.vocab.strings.add(key)
else: else:
return key return key
@ -360,7 +360,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
for i, token in enumerate(doclike): for i, token in enumerate(doclike):
for name, index in extensions.items(): for name, index in extensions.items():
value = token._.get(name) value = token._.get(name)
if isinstance(value, basestring): if isinstance(value, str):
value = token.vocab.strings[value] value = token.vocab.strings[value]
extra_attr_values[i * nr_extra_attr + index] = value extra_attr_values[i * nr_extra_attr + index] = value
# Main loop # Main loop
@ -786,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
def _get_attr_values(spec, string_store): def _get_attr_values(spec, string_store):
attr_values = [] attr_values = []
for attr, value in spec.items(): for attr, value in spec.items():
if isinstance(attr, basestring): if isinstance(attr, str):
attr = attr.upper() attr = attr.upper()
if attr == '_': if attr == '_':
continue continue
@ -797,7 +797,7 @@ def _get_attr_values(spec, string_store):
if attr == "IS_SENT_START": if attr == "IS_SENT_START":
attr = "SENT_START" attr = "SENT_START"
attr = IDS.get(attr) attr = IDS.get(attr)
if isinstance(value, basestring): if isinstance(value, str):
value = string_store.add(value) value = string_store.add(value)
elif isinstance(value, bool): elif isinstance(value, bool):
value = int(value) value = int(value)
@ -938,7 +938,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
seen_predicates = {pred.key: pred.i for pred in extra_predicates} seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = [] output = []
for attr, value in spec.items(): for attr, value in spec.items():
if isinstance(attr, basestring): if isinstance(attr, str):
if attr == "_": if attr == "_":
output.extend( output.extend(
_get_extension_extra_predicates( _get_extension_extra_predicates(
@ -995,7 +995,7 @@ def _get_operators(spec):
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)} "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
# Fix casing # Fix casing
spec = {key.upper(): values for key, values in spec.items() spec = {key.upper(): values for key, values in spec.items()
if isinstance(key, basestring)} if isinstance(key, str)}
if "OP" not in spec: if "OP" not in spec:
return (ONE,) return (ONE,)
elif spec["OP"] in lookup: elif spec["OP"] in lookup:
@ -1013,7 +1013,7 @@ def _get_extensions(spec, string_store, name2index):
if isinstance(value, dict): if isinstance(value, dict):
# Handle predicates (e.g. "IN", in the extra_predicates, not here. # Handle predicates (e.g. "IN", in the extra_predicates, not here.
continue continue
if isinstance(value, basestring): if isinstance(value, str):
value = string_store.add(value) value = string_store.add(value)
if name not in name2index: if name not in name2index:
name2index[name] = len(name2index) name2index[name] = len(name2index)

View File

@ -1,11 +1,13 @@
from typing import List, Tuple, Callable, Optional, cast from typing import List, Tuple, Callable, Optional, Sequence, cast
from thinc.initializers import glorot_uniform_init from thinc.initializers import glorot_uniform_init
from thinc.util import partial from thinc.util import partial
from thinc.types import Ragged, Floats2d, Floats1d from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
from thinc.api import Model, Ops, registry from thinc.api import Model, Ops, registry
from ..tokens import Doc from ..tokens import Doc
from ..errors import Errors from ..errors import Errors
from ..vectors import Mode
from ..vocab import Vocab
@registry.layers("spacy.StaticVectors.v2") @registry.layers("spacy.StaticVectors.v2")
@ -34,20 +36,32 @@ def StaticVectors(
def forward( def forward(
model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
) -> Tuple[Ragged, Callable]: ) -> Tuple[Ragged, Callable]:
if not sum(len(doc) for doc in docs): token_count = sum(len(doc) for doc in docs)
if not token_count:
return _handle_empty(model.ops, model.get_dim("nO")) return _handle_empty(model.ops, model.get_dim("nO"))
key_attr = model.attrs["key_attr"] key_attr: int = model.attrs["key_attr"]
W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) keys: Ints1d = model.ops.flatten(
V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data)) cast(Sequence, [doc.to_array(key_attr) for doc in docs])
rows = model.ops.flatten(
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
) )
vocab: Vocab = docs[0].vocab
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default:
V = cast(Floats2d, model.ops.asarray(vocab.vectors.data))
rows = vocab.vectors.find(keys=keys)
V = model.ops.as_contig(V[rows])
elif vocab.vectors.mode == Mode.floret:
V = cast(Floats2d, vocab.vectors.get_batch(keys))
V = model.ops.as_contig(V)
else:
raise RuntimeError(Errors.E896)
try: try:
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True) vectors_data = model.ops.gemm(V, W, trans2=True)
except ValueError: except ValueError:
raise RuntimeError(Errors.E896) raise RuntimeError(Errors.E896)
# Convert negative indices to 0-vectors (TODO: more options for UNK tokens) if vocab.vectors.mode == Mode.default:
vectors_data[rows < 0] = 0 # Convert negative indices to 0-vectors
# TODO: more options for UNK tokens
vectors_data[rows < 0] = 0
output = Ragged( output = Ragged(
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
) )
@ -63,7 +77,7 @@ def forward(
model.inc_grad( model.inc_grad(
"W", "W",
model.ops.gemm( model.ops.gemm(
cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True
), ),
) )
return [] return []
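The StaticVectors forward pass above now branches on the vectors mode: default vectors are looked up by row with Vectors.find, while floret vectors are built per key with get_batch, so there are no out-of-vocabulary rows to zero out. A rough numpy sketch of that branching, assuming the attribute names shown in the diff (the lookup_static_vectors helper itself is hypothetical):

    import numpy
    from spacy.vectors import Mode

    def lookup_static_vectors(vocab, keys):
        if vocab.vectors.mode == Mode.default:
            rows = vocab.vectors.find(keys=keys)           # -1 for unknown keys
            V = numpy.ascontiguousarray(vocab.vectors.data[rows])
            V[rows < 0] = 0                                # zero vectors for unknowns
        elif vocab.vectors.mode == Mode.floret:
            V = numpy.ascontiguousarray(vocab.vectors.get_batch(keys))
        else:
            raise RuntimeError("unsupported vectors mode")
        return V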
View File
@ -17,7 +17,7 @@ from ...errors import Errors
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
cdef weight_t MIN_SCORE = -90000 cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string(u'subtok') cdef attr_t SUBTOK_LABEL = hash_string('subtok')
DEF NON_MONOTONIC = True DEF NON_MONOTONIC = True
View File
@ -5,15 +5,15 @@ from pathlib import Path
from .pipe import Pipe from .pipe import Pipe
from ..errors import Errors from ..errors import Errors
from ..training import validate_examples, Example from ..training import Example
from ..language import Language from ..language import Language
from ..matcher import Matcher from ..matcher import Matcher
from ..scorer import Scorer from ..scorer import Scorer
from ..symbols import IDS, TAG, POS, MORPH, LEMMA from ..symbols import IDS
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab from ..vocab import Vocab
from ..util import SimpleFrozenList from ..util import SimpleFrozenList, registry
from .. import util from .. import util
@ -23,9 +23,41 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
@Language.factory("attribute_ruler", default_config={"validate": False}) @Language.factory(
def make_attribute_ruler(nlp: Language, name: str, validate: bool): "attribute_ruler",
return AttributeRuler(nlp.vocab, name, validate=validate) default_config={
"validate": False,
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
},
)
def make_attribute_ruler(
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(
Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
@registry.scorers("spacy.attribute_ruler_scorer.v1")
def make_attribute_ruler_scorer():
return attribute_ruler_score
class AttributeRuler(Pipe): class AttributeRuler(Pipe):
@ -36,7 +68,12 @@ class AttributeRuler(Pipe):
""" """
def __init__( def __init__(
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False self,
vocab: Vocab,
name: str = "attribute_ruler",
*,
validate: bool = False,
scorer: Optional[Callable] = attribute_ruler_score,
) -> None: ) -> None:
"""Create the AttributeRuler. After creation, you can add patterns """Create the AttributeRuler. After creation, you can add patterns
with the `.initialize()` or `.add_patterns()` methods, or load patterns with the `.initialize()` or `.add_patterns()` methods, or load patterns
@ -45,6 +82,10 @@ class AttributeRuler(Pipe):
vocab (Vocab): The vocab. vocab (Vocab): The vocab.
name (str): The pipe name. Defaults to "attribute_ruler". name (str): The pipe name. Defaults to "attribute_ruler".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
"lemma" and Scorer.score_token_attr_per_feat for the attribute
"morph".
RETURNS (AttributeRuler): The AttributeRuler component. RETURNS (AttributeRuler): The AttributeRuler component.
@ -57,6 +98,7 @@ class AttributeRuler(Pipe):
self.attrs: List[Dict] = [] self.attrs: List[Dict] = []
self._attrs_unnormed: List[Dict] = [] # store for reference self._attrs_unnormed: List[Dict] = [] # store for reference
self.indices: List[int] = [] self.indices: List[int] = []
self.scorer = scorer
def clear(self) -> None: def clear(self) -> None:
"""Reset all patterns.""" """Reset all patterns."""
@ -228,45 +270,6 @@ class AttributeRuler(Pipe):
all_patterns.append(p) all_patterns.append(p)
return all_patterns # type: ignore[return-value] return all_patterns # type: ignore[return-value]
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
and "lemma" for the target token attributes.
DOCS: https://spacy.io/api/tagger#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "AttributeRuler.score")
results = {}
attrs = set() # type: ignore
for token_attrs in self.attrs:
attrs.update(token_attrs)
for attr in attrs:
if attr == TAG:
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
elif attr == POS:
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
elif attr == MORPH:
results.update(
Scorer.score_token_attr(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
elif attr == LEMMA:
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the AttributeRuler to a bytestring. """Serialize the AttributeRuler to a bytestring.
View File
@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from collections import defaultdict from collections import defaultdict
from typing import Optional, Iterable from typing import Optional, Iterable, Callable
from thinc.api import Model, Config from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem from ._parser_internals.transition_system import TransitionSystem
@ -12,7 +12,7 @@ from ..language import Language
from ._parser_internals import nonproj from ._parser_internals import nonproj
from ._parser_internals.nonproj import DELIMITER from ._parser_internals.nonproj import DELIMITER
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples from ..util import registry
default_model_config = """ default_model_config = """
@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"learn_tokens": False, "learn_tokens": False,
"min_action_freq": 30, "min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL, "model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"dep_uas": 0.5, "dep_uas": 0.5,
@ -63,7 +64,8 @@ def make_parser(
moves: Optional[TransitionSystem], moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int, update_with_oracle_cut_size: int,
learn_tokens: bool, learn_tokens: bool,
min_action_freq: int min_action_freq: int,
scorer: Optional[Callable],
): ):
"""Create a transition-based DependencyParser component. The dependency parser """Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can jointly learns sentence segmentation and labelled dependency parsing, and can
@ -100,6 +102,7 @@ def make_parser(
primarily affects the label accuracy, it can also affect the attachment primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity structure, as the labels are used to represent the pseudo-projectivity
transformation. transformation.
scorer (Optional[Callable]): The scoring method.
""" """
return DependencyParser( return DependencyParser(
nlp.vocab, nlp.vocab,
@ -115,7 +118,8 @@ def make_parser(
beam_update_prob=0.0, beam_update_prob=0.0,
# At some point in the future we can try to implement support for # At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective. # partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None incorrect_spans_key=None,
scorer=scorer,
) )
@Language.factory( @Language.factory(
@ -130,6 +134,7 @@ def make_parser(
"learn_tokens": False, "learn_tokens": False,
"min_action_freq": 30, "min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL, "model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"dep_uas": 0.5, "dep_uas": 0.5,
@ -151,6 +156,7 @@ def make_beam_parser(
beam_width: int, beam_width: int,
beam_density: float, beam_density: float,
beam_update_prob: float, beam_update_prob: float,
scorer: Optional[Callable],
): ):
"""Create a transition-based DependencyParser component that uses beam-search. """Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled The dependency parser jointly learns sentence segmentation and labelled
@ -207,10 +213,41 @@ def make_beam_parser(
min_action_freq=min_action_freq, min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for # At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective. # partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None incorrect_spans_key=None,
scorer=scorer,
) )
def parser_score(examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
@registry.scorers("spacy.parser_scorer.v1")
def make_parser_scorer():
return parser_score
cdef class DependencyParser(Parser): cdef class DependencyParser(Parser):
"""Pipeline component for dependency parsing. """Pipeline component for dependency parsing.
@ -233,6 +270,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=0.0, beam_update_prob=0.0,
multitasks=tuple(), multitasks=tuple(),
incorrect_spans_key=None, incorrect_spans_key=None,
scorer=parser_score,
): ):
"""Create a DependencyParser. """Create a DependencyParser.
""" """
@ -249,6 +287,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=beam_update_prob, beam_update_prob=beam_update_prob,
multitasks=multitasks, multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key, incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
) )
@property @property
@ -281,31 +320,6 @@ cdef class DependencyParser(Parser):
labels.add(label) labels.add(label)
return tuple(sorted(labels)) return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
def scored_parses(self, beams): def scored_parses(self, beams):
"""Return two dictionaries with scores for each beam/doc that was processed: """Return two dictionaries with scores for each beam/doc that was processed:
one containing (i, head) keys, and another containing (i, label) keys. one containing (i, head) keys, and another containing (i, label) keys.
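parser_score above combines sentence-boundary scoring with labelled dependency scoring; its dep_getter resolves each dependency hash to a lower-cased string before comparison. A small standalone sketch of the dependency part, assuming examples is a list of spacy.training.Example objects with gold parses:

    from spacy.scorer import Scorer

    def dep_getter(token, attr):
        # Resolve the hash to its string label and lower-case it, as in parser_score.
        return token.vocab.strings.as_string(getattr(token, attr)).lower()

    scores = Scorer.score_deps(
        examples, "dep", getter=dep_getter, ignore_labels=("p", "punct")
    )
    print(scores["dep_uas"], scores["dep_las"])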
View File
@ -17,10 +17,12 @@ from ..language import Language
from ..vocab import Vocab from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import SimpleFrozenList from ..util import SimpleFrozenList, registry
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
# See #9050
BACKWARD_OVERWRITE = True
default_model_config = """ default_model_config = """
[model] [model]
@ -51,6 +53,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True, "incl_context": True,
"entity_vector_length": 64, "entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"nel_micro_f": 1.0, "nel_micro_f": 1.0,
@ -69,6 +73,8 @@ def make_entity_linker(
incl_context: bool, incl_context: bool,
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool,
scorer: Optional[Callable],
): ):
"""Construct an EntityLinker component. """Construct an EntityLinker component.
@ -82,6 +88,7 @@ def make_entity_linker(
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityLinker( return EntityLinker(
nlp.vocab, nlp.vocab,
@ -93,9 +100,20 @@ def make_entity_linker(
incl_context=incl_context, incl_context=incl_context,
entity_vector_length=entity_vector_length, entity_vector_length=entity_vector_length,
get_candidates=get_candidates, get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
) )
def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
@registry.scorers("spacy.entity_linker_scorer.v1")
def make_entity_linker_scorer():
return entity_linker_score
class EntityLinker(TrainablePipe): class EntityLinker(TrainablePipe):
"""Pipeline component for named entity linking. """Pipeline component for named entity linking.
@ -116,6 +134,8 @@ class EntityLinker(TrainablePipe):
incl_context: bool, incl_context: bool,
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
) -> None: ) -> None:
"""Initialize an entity linker. """Initialize an entity linker.
@ -130,6 +150,8 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """
@ -141,11 +163,12 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior self.incl_prior = incl_prior
self.incl_context = incl_context self.incl_context = incl_context
self.get_candidates = get_candidates self.get_candidates = get_candidates
self.cfg: Dict[str, Any] = {} self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False) self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account # how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab) self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will """Define the KB of this pipe by providing a function that will
@ -384,23 +407,14 @@ class EntityLinker(TrainablePipe):
if count_ents != len(kb_ids): if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0 i = 0
overwrite = self.cfg["overwrite"]
for doc in docs: for doc in docs:
for ent in doc.ents: for ent in doc.ents:
kb_id = kb_ids[i] kb_id = kb_ids[i]
i += 1 i += 1
for token in ent: for token in ent:
token.ent_kb_id_ = kb_id if token.ent_kb_id == 0 or overwrite:
token.ent_kb_id_ = kb_id
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS TODO: https://spacy.io/api/entity_linker#score
"""
validate_examples(examples, "EntityLinker.score")
return Scorer.score_links(examples, negative_labels=[self.NIL])
def to_bytes(self, *, exclude=tuple()): def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring. """Serialize the pipe to a bytestring.
View File
@ -9,11 +9,10 @@ from .pipe import Pipe
from ..training import Example from ..training import Example
from ..language import Language from ..language import Language
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from ..scorer import get_ner_prf from ..scorer import get_ner_prf
from ..training import validate_examples
DEFAULT_ENT_ID_SEP = "||" DEFAULT_ENT_ID_SEP = "||"
@ -28,6 +27,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
"validate": False, "validate": False,
"overwrite_ents": False, "overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP, "ent_id_sep": DEFAULT_ENT_ID_SEP,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"ents_f": 1.0, "ents_f": 1.0,
@ -43,6 +43,7 @@ def make_entity_ruler(
validate: bool, validate: bool,
overwrite_ents: bool, overwrite_ents: bool,
ent_id_sep: str, ent_id_sep: str,
scorer: Optional[Callable],
): ):
return EntityRuler( return EntityRuler(
nlp, nlp,
@ -51,9 +52,19 @@ def make_entity_ruler(
validate=validate, validate=validate,
overwrite_ents=overwrite_ents, overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep, ent_id_sep=ent_id_sep,
scorer=scorer,
) )
def entity_ruler_score(examples, **kwargs):
return get_ner_prf(examples)
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
return entity_ruler_score
class EntityRuler(Pipe): class EntityRuler(Pipe):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based """The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical rules or exact phrase matches. It can be combined with the statistical
@ -75,6 +86,7 @@ class EntityRuler(Pipe):
overwrite_ents: bool = False, overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP, ent_id_sep: str = DEFAULT_ENT_ID_SEP,
patterns: Optional[List[PatternType]] = None, patterns: Optional[List[PatternType]] = None,
scorer: Optional[Callable] = entity_ruler_score,
) -> None: ) -> None:
"""Initialize the entity ruler. If patterns are supplied here, they """Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"` need to be a list of dictionaries with a `"label"` and `"pattern"`
@ -95,6 +107,8 @@ class EntityRuler(Pipe):
overwrite_ents (bool): If existing entities are present, e.g. entities overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary. added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs. ent_id_sep (str): Separator used internally for entity IDs.
scorer (Optional[Callable]): The scoring method. Defaults to
spacy.scorer.get_ner_prf.
DOCS: https://spacy.io/api/entityruler#init DOCS: https://spacy.io/api/entityruler#init
""" """
@ -113,6 +127,7 @@ class EntityRuler(Pipe):
self._ent_ids = defaultdict(tuple) # type: ignore self._ent_ids = defaultdict(tuple) # type: ignore
if patterns is not None: if patterns is not None:
self.add_patterns(patterns) self.add_patterns(patterns)
self.scorer = scorer
def __len__(self) -> int: def __len__(self) -> int:
"""The number of all patterns added to the entity ruler.""" """The number of all patterns added to the entity ruler."""
@ -363,10 +378,6 @@ class EntityRuler(Pipe):
label = f"{label}{self.ent_id_sep}{ent_id}" label = f"{label}{self.ent_id_sep}{ent_id}"
return label return label
def score(self, examples, **kwargs):
validate_examples(examples, "EntityRuler.score")
return get_ner_prf(examples)
def from_bytes( def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler": ) -> "EntityRuler":
View File
@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc, Token from ..tokens import Doc, Token
from ..vocab import Vocab from ..vocab import Vocab
from ..training import validate_examples from ..util import logger, SimpleFrozenList, registry
from ..util import logger, SimpleFrozenList
from .. import util from .. import util
@Language.factory( @Language.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "lookup", "overwrite": False}, default_config={
"model": None,
"mode": "lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_token_attr(examples, "lemma", **kwargs)
@registry.scorers("spacy.lemmatizer_scorer.v1")
def make_lemmatizer_scorer():
return lemmatizer_score
class Lemmatizer(Pipe): class Lemmatizer(Pipe):
@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
*, *,
mode: str = "lookup", mode: str = "lookup",
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None: ) -> None:
"""Initialize a Lemmatizer. """Initialize a Lemmatizer.
@ -69,6 +90,8 @@ class Lemmatizer(Pipe):
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
overwrite (bool): Whether to overwrite existing lemmas. Defaults to overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`. `False`.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "lemma".
DOCS: https://spacy.io/api/lemmatizer#init DOCS: https://spacy.io/api/lemmatizer#init
""" """
@ -89,6 +112,7 @@ class Lemmatizer(Pipe):
raise ValueError(Errors.E1003.format(mode=mode)) raise ValueError(Errors.E1003.format(mode=mode))
self.lemmatize = getattr(self, mode_attr) self.lemmatize = getattr(self, mode_attr)
self.cache = {} # type: ignore[var-annotated] self.cache = {} # type: ignore[var-annotated]
self.scorer = scorer
@property @property
def mode(self): def mode(self):
@ -247,17 +271,6 @@ class Lemmatizer(Pipe):
""" """
return False return False
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS: https://spacy.io/api/lemmatizer#score
"""
validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk( def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
): ):
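Because the lemmatizer's scorer is now registered, it can also be resolved by name outside the config system. A short sketch, assuming examples is a list of spacy.training.Example objects with gold lemmas:

    from spacy.util import registry

    lemma_scorer = registry.scorers.get("spacy.lemmatizer_scorer.v1")()
    print(lemma_scorer(examples)["lemma_acc"])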
View File
@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional, Union, Dict from typing import Optional, Union, Dict, Callable
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice from itertools import islice
@ -17,7 +17,11 @@ from .tagger import Tagger
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
# See #9050
BACKWARD_OVERWRITE = True
BACKWARD_EXTEND = False
default_model_config = """ default_model_config = """
[model] [model]
@ -48,15 +52,35 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"morphologizer", "morphologizer",
assigns=["token.morph", "token.pos"], assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL}, default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
) )
def make_morphologizer( def make_morphologizer(
nlp: Language, nlp: Language,
model: Model, model: Model,
name: str, name: str,
overwrite: bool,
extend: bool,
scorer: Optional[Callable],
): ):
return Morphologizer(nlp.vocab, model, name) return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
def morphologizer_score(examples, **kwargs):
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results
@registry.scorers("spacy.morphologizer_scorer.v1")
def make_morphologizer_scorer():
return morphologizer_score
class Morphologizer(Tagger): class Morphologizer(Tagger):
@ -67,6 +91,10 @@ class Morphologizer(Tagger):
vocab: Vocab, vocab: Vocab,
model: Model, model: Model,
name: str = "morphologizer", name: str = "morphologizer",
*,
overwrite: bool = BACKWARD_OVERWRITE,
extend: bool = BACKWARD_EXTEND,
scorer: Optional[Callable] = morphologizer_score,
): ):
"""Initialize a morphologizer. """Initialize a morphologizer.
@ -74,6 +102,9 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#init DOCS: https://spacy.io/api/morphologizer#init
""" """
@ -85,8 +116,14 @@ class Morphologizer(Tagger):
# store mappings from morph+POS labels to token-level annotations: # store mappings from morph+POS labels to token-level annotations:
# 1) labels_morph stores a mapping from morph+POS->morph # 1) labels_morph stores a mapping from morph+POS->morph
# 2) labels_pos stores a mapping from morph+POS->POS # 2) labels_pos stores a mapping from morph+POS->POS
cfg = {"labels_morph": {}, "labels_pos": {}} cfg = {
"labels_morph": {},
"labels_pos": {},
"overwrite": overwrite,
"extend": extend,
}
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -192,14 +229,34 @@ class Morphologizer(Tagger):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
cdef bint extend = self.cfg["extend"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
morph = self.labels[tag_id] morph = self.labels[tag_id]
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) # set morph
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) if doc.c[j].morph == 0 or overwrite or extend:
if overwrite and extend:
# morphologizer morph overwrites any existing features
# while extending
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
elif extend:
# existing features are preserved and any new features
# are added
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
else:
# clobber
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
# set POS
if doc.c[j].pos == 0 or overwrite:
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
def get_loss(self, examples, scores): def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and """Find the loss and gradient of loss for the batch of documents and
@ -246,24 +303,3 @@ class Morphologizer(Tagger):
if self.model.ops.xp.isnan(loss): if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name)) raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores return float(loss), d_scores
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "Morphologizer.score")
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results
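The overwrite and extend flags added to the morphologizer above control how predicted morph features interact with features already on a token: tokens without features always receive the prediction, overwrite replaces existing features, and extend merges the two feature dictionaries (via Morphology.feats_to_dict). A plain-dict sketch of the three merge branches, with hypothetical feature values and the two config flags as free variables:

    existing = {"Number": "Sing"}                   # features already on the token
    predicted = {"Case": "Nom", "Number": "Plur"}   # features from the model

    if overwrite and extend:
        merged = {**existing, **predicted}   # predicted values win on conflicts
    elif extend:
        merged = {**predicted, **existing}   # existing values are preserved
    else:
        merged = predicted                   # plain overwrite (or token had none)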
View File
@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from collections import defaultdict from collections import defaultdict
from typing import Optional, Iterable from typing import Optional, Iterable, Callable
from thinc.api import Model, Config from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem from ._parser_internals.transition_system import TransitionSystem
@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language from ..language import Language
from ..scorer import get_ner_prf, PRFScore from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples from ..util import registry
default_model_config = """ default_model_config = """
@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None, "moves": None,
"update_with_oracle_cut_size": 100, "update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL, "model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None "incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@ -52,7 +53,8 @@ def make_ner(
model: Model, model: Model,
moves: Optional[TransitionSystem], moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int, update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str]=None incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
): ):
"""Create a transition-based EntityRecognizer component. The entity recognizer """Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens. identifies non-overlapping labelled spans of tokens.
@ -80,6 +82,7 @@ def make_ner(
incorrect_spans_key (Optional[str]): Identifies spans that are known incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key. can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityRecognizer( return EntityRecognizer(
nlp.vocab, nlp.vocab,
@ -92,6 +95,7 @@ def make_ner(
beam_width=1, beam_width=1,
beam_density=0.0, beam_density=0.0,
beam_update_prob=0.0, beam_update_prob=0.0,
scorer=scorer,
) )
@Language.factory( @Language.factory(
@ -104,7 +108,8 @@ def make_ner(
"beam_density": 0.01, "beam_density": 0.01,
"beam_update_prob": 0.5, "beam_update_prob": 0.5,
"beam_width": 32, "beam_width": 32,
"incorrect_spans_key": None "incorrect_spans_key": None,
"scorer": None,
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
) )
@ -117,7 +122,8 @@ def make_beam_ner(
beam_width: int, beam_width: int,
beam_density: float, beam_density: float,
beam_update_prob: float, beam_update_prob: float,
incorrect_spans_key: Optional[str]=None incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
): ):
"""Create a transition-based EntityRecognizer component that uses beam-search. """Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens. The entity recognizer identifies non-overlapping labelled spans of tokens.
@ -153,6 +159,7 @@ def make_beam_ner(
and are faster to compute. and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities. entities known to be non-entities.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityRecognizer( return EntityRecognizer(
nlp.vocab, nlp.vocab,
@ -164,10 +171,20 @@ def make_beam_ner(
beam_width=beam_width, beam_width=beam_width,
beam_density=beam_density, beam_density=beam_density,
beam_update_prob=beam_update_prob, beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
) )
def ner_score(examples, **kwargs):
return get_ner_prf(examples, **kwargs)
@registry.scorers("spacy.ner_scorer.v1")
def make_ner_scorer():
return ner_score
cdef class EntityRecognizer(Parser): cdef class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition. """Pipeline component for named entity recognition.
@ -188,6 +205,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=0.0, beam_update_prob=0.0,
multitasks=tuple(), multitasks=tuple(),
incorrect_spans_key=None, incorrect_spans_key=None,
scorer=ner_score,
): ):
"""Create an EntityRecognizer. """Create an EntityRecognizer.
""" """
@ -204,6 +222,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=beam_update_prob, beam_update_prob=beam_update_prob,
multitasks=multitasks, multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key, incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
) )
def add_multitask_objective(self, mt_component): def add_multitask_objective(self, mt_component):
@ -227,17 +246,6 @@ cdef class EntityRecognizer(Parser):
if move[0] in ("B", "I", "L", "U")) if move[0] in ("B", "I", "L", "U"))
return tuple(sorted(labels)) return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
DOCS: https://spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
return get_ner_prf(examples)
def scored_ents(self, beams): def scored_ents(self, beams):
"""Return a dictionary of (start, end, label) tuples with corresponding scores """Return a dictionary of (start, end, label) tuples with corresponding scores
for each beam/doc that was processed. for each beam/doc that was processed.
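Both the entity recognizer and the entity ruler now route scoring through get_ner_prf, exposed as the registered scorers spacy.ner_scorer.v1 and spacy.entity_ruler_scorer.v1. A direct usage sketch, assuming examples is a list of spacy.training.Example objects with gold entities:

    from spacy.scorer import get_ner_prf

    scores = get_ner_prf(examples)
    print(scores["ents_p"], scores["ents_r"], scores["ents_f"])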
View File
@ -81,6 +81,17 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#score DOCS: https://spacy.io/api/pipe#score
""" """
if hasattr(self, "scorer") and self.scorer is not None:
scorer_kwargs = {}
# use default settings from cfg (e.g., threshold)
if hasattr(self, "cfg") and isinstance(self.cfg, dict):
scorer_kwargs.update(self.cfg)
# override self.cfg["labels"] with self.labels
if hasattr(self, "labels"):
scorer_kwargs["labels"] = self.labels
# override with kwargs settings
scorer_kwargs.update(kwargs)
return self.scorer(examples, **scorer_kwargs)
return {} return {}
@property @property
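The new default Pipe.score above delegates to self.scorer and assembles its keyword arguments in three layers, so caller-supplied settings always win. A condensed sketch of that precedence with hypothetical values:

    cfg = {"threshold": 0.5, "labels": ["A"]}   # component.cfg
    labels = ("A", "B")                         # component.labels property
    caller_kwargs = {"threshold": 0.7}          # arguments passed to score()

    scorer_kwargs = {}
    scorer_kwargs.update(cfg)                   # {"threshold": 0.5, "labels": ["A"]}
    scorer_kwargs["labels"] = labels            # labels property overrides cfg["labels"]
    scorer_kwargs.update(caller_kwargs)         # {"threshold": 0.7, "labels": ("A", "B")}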
View File
@ -1,26 +1,32 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional, List from typing import Optional, List, Callable
import srsly import srsly
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from .pipe import Pipe from .pipe import Pipe
from .senter import senter_score
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples
from .. import util from .. import util
# see #9050
BACKWARD_OVERWRITE = False
@Language.factory( @Language.factory(
"sentencizer", "sentencizer",
assigns=["token.is_sent_start", "doc.sents"], assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None}, default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
) )
def make_sentencizer( def make_sentencizer(
nlp: Language, nlp: Language,
name: str, name: str,
punct_chars: Optional[List[str]] punct_chars: Optional[List[str]],
overwrite: bool,
scorer: Optional[Callable],
): ):
return Sentencizer(name, punct_chars=punct_chars) return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
class Sentencizer(Pipe): class Sentencizer(Pipe):
@ -41,12 +47,20 @@ class Sentencizer(Pipe):
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'', ''] '', '']
def __init__(self, name="sentencizer", *, punct_chars=None): def __init__(
self,
name="sentencizer",
*,
punct_chars=None,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize the sentencizer. """Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object. serialized with the nlp object.
RETURNS (Sentencizer): The sentencizer component. scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencizer#init DOCS: https://spacy.io/api/sentencizer#init
""" """
@ -55,6 +69,8 @@ class Sentencizer(Pipe):
self.punct_chars = set(punct_chars) self.punct_chars = set(punct_chars)
else: else:
self.punct_chars = set(self.default_punct_chars) self.punct_chars = set(self.default_punct_chars)
self.overwrite = overwrite
self.scorer = scorer
def __call__(self, doc): def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start. """Apply the sentencizer to a Doc and set Token.is_sent_start.
@ -115,29 +131,12 @@ class Sentencizer(Pipe):
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber existing sentence boundaries if doc.c[j].sent_start == 0 or self.overwrite:
if doc.c[j].sent_start == 0:
if tag_id: if tag_id:
doc.c[j].sent_start = 1 doc.c[j].sent_start = 1
else: else:
doc.c[j].sent_start = -1 doc.c[j].sent_start = -1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "Sentencizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
def to_bytes(self, *, exclude=tuple()): def to_bytes(self, *, exclude=tuple()):
"""Serialize the sentencizer to a bytestring. """Serialize the sentencizer to a bytestring.
@ -145,7 +144,7 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer#to_bytes DOCS: https://spacy.io/api/sentencizer#to_bytes
""" """
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_bytes(self, bytes_data, *, exclude=tuple()): def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the sentencizer from a bytestring. """Load the sentencizer from a bytestring.
@ -157,6 +156,7 @@ class Sentencizer(Pipe):
""" """
cfg = srsly.msgpack_loads(bytes_data) cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self return self
def to_disk(self, path, *, exclude=tuple()): def to_disk(self, path, *, exclude=tuple()):
@ -166,7 +166,7 @@ class Sentencizer(Pipe):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
path = path.with_suffix(".json") path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_disk(self, path, *, exclude=tuple()): def from_disk(self, path, *, exclude=tuple()):
@ -178,4 +178,5 @@ class Sentencizer(Pipe):
path = path.with_suffix(".json") path = path.with_suffix(".json")
cfg = srsly.read_json(path) cfg = srsly.read_json(path)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self return self
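The sentencizer's overwrite flag is now round-tripped through to_bytes/from_bytes and to_disk/from_disk alongside punct_chars. A short sketch of the bytes round trip:

    import spacy

    nlp = spacy.blank("en")
    sentencizer = nlp.add_pipe("sentencizer", config={"overwrite": True})
    data = sentencizer.to_bytes()

    restored = spacy.blank("en").add_pipe("sentencizer")
    restored.from_bytes(data)
    assert restored.overwrite is True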
View File
@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from itertools import islice from itertools import islice
from typing import Optional, Callable
import srsly import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@ -11,8 +12,11 @@ from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """ default_model_config = """
[model] [model]
@ -34,11 +38,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"senter", "senter",
assigns=["token.is_sent_start"], assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL}, default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
) )
def make_senter(nlp: Language, name: str, model: Model): def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name) return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def senter_score(examples, **kwargs):
def has_sents(doc):
return doc.has_annotation("SENT_START")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer():
return senter_score
class SentenceRecognizer(Tagger): class SentenceRecognizer(Tagger):
@ -46,13 +64,23 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer DOCS: https://spacy.io/api/sentencerecognizer
""" """
def __init__(self, vocab, model, name="senter"): def __init__(
self,
vocab,
model,
name="senter",
*,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize a sentence recognizer. """Initialize a sentence recognizer.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencerecognizer#init DOCS: https://spacy.io/api/sentencerecognizer#init
""" """
@ -60,7 +88,8 @@ class SentenceRecognizer(Tagger):
self.model = model self.model = model
self.name = name self.name = name
self._rehearsal_model = None self._rehearsal_model = None
self.cfg = {} self.cfg = {"overwrite": overwrite}
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -85,13 +114,13 @@ class SentenceRecognizer(Tagger):
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber existing sentence boundaries if doc.c[j].sent_start == 0 or overwrite:
if doc.c[j].sent_start == 0:
if tag_id == 1: if tag_id == 1:
doc.c[j].sent_start = 1 doc.c[j].sent_start = 1
else: else:
@ -153,18 +182,3 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None): def add_label(self, label, values=None):
raise NotImplementedError raise NotImplementedError
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencerecognizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
View File
@ -104,6 +104,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
"max_positive": None, "max_positive": None,
"model": DEFAULT_SPANCAT_MODEL, "model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
}, },
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
) )
@ -113,8 +114,9 @@ def make_spancat(
suggester: Suggester, suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d], model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str, spans_key: str,
threshold: float = 0.5, scorer: Optional[Callable],
max_positive: Optional[int] = None, threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer": ) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two """Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller parts: a suggester function that proposes candidate spans, and a labeller
@ -144,9 +146,28 @@ def make_spancat(
threshold=threshold, threshold=threshold,
max_positive=max_positive, max_positive=max_positive,
name=name, name=name,
scorer=scorer,
) )
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs)
attr_prefix = "spans_"
key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
@registry.scorers("spacy.spancat_scorer.v1")
def make_spancat_scorer():
return spancat_score
class SpanCategorizer(TrainablePipe): class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text. """Pipeline component to label spans of text.
@ -163,8 +184,25 @@ class SpanCategorizer(TrainablePipe):
spans_key: str = "spans", spans_key: str = "spans",
threshold: float = 0.5, threshold: float = 0.5,
max_positive: Optional[int] = None, max_positive: Optional[int] = None,
scorer: Optional[Callable] = spancat_score,
) -> None: ) -> None:
"""Initialize the span categorizer. """Initialize the span categorizer.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
spans_key (str): Key of the Doc.spans dict to save the spans under.
During initialization and training, the component will look for
spans on the reference document under the same key. Defaults to
`"spans"`.
threshold (float): Minimum probability to consider a prediction
positive. Spans with a positive prediction will be saved on the Doc.
Defaults to 0.5.
max_positive (Optional[int]): Maximum number of labels to consider
positive per span. Defaults to None, indicating no limit.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
DOCS: https://spacy.io/api/spancategorizer#init DOCS: https://spacy.io/api/spancategorizer#init
""" """
@ -178,6 +216,7 @@ class SpanCategorizer(TrainablePipe):
self.suggester = suggester self.suggester = suggester
self.model = model self.model = model
self.name = name self.name = name
self.scorer = scorer
@property @property
def key(self) -> str: def key(self) -> str:
@ -379,26 +418,6 @@ class SpanCategorizer(TrainablePipe):
else: else:
self.model.initialize() self.model.initialize()
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/spancategorizer#score
"""
validate_examples(examples, "SpanCategorizer.score")
self._validate_categories(examples)
kwargs = dict(kwargs)
attr_prefix = "spans_"
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
def _validate_categories(self, examples: Iterable[Example]): def _validate_categories(self, examples: Iterable[Example]):
# TODO # TODO
pass pass
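spancat_score above receives spans_key through its kwargs (supplied from the component's cfg by the default Pipe.score) and maps the prefixed attribute name back to the Doc.spans key. A small sketch of that mapping, using a hypothetical "sc" key:

    attr_prefix = "spans_"
    spans_key = "sc"                        # value from the component's cfg
    attr = f"{attr_prefix}{spans_key}"      # "spans_sc", used in the score names

    def getter(doc, key):
        # Strip the prefix again to look up Doc.spans["sc"].
        return doc.spans.get(key[len(attr_prefix):], [])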
View File
@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Callable, Optional
import numpy import numpy
import srsly import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@ -18,8 +19,11 @@ from ..parts_of_speech import X
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """ default_model_config = """
[model] [model]
@ -41,10 +45,16 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"tagger", "tagger",
assigns=["token.tag"], assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL}, default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
default_score_weights={"tag_acc": 1.0}, default_score_weights={"tag_acc": 1.0},
) )
def make_tagger(nlp: Language, name: str, model: Model): def make_tagger(
nlp: Language,
name: str,
model: Model,
overwrite: bool,
scorer: Optional[Callable],
):
"""Construct a part-of-speech tagger component. """Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
@ -52,7 +62,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
in size, and be normalized as probabilities (all scores between 0 and 1, in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1). with the rows summing to 1).
""" """
return Tagger(nlp.vocab, model, name) return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)
@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
return tagger_score
class Tagger(TrainablePipe): class Tagger(TrainablePipe):
@ -60,13 +79,23 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger DOCS: https://spacy.io/api/tagger
""" """
def __init__(self, vocab, model, name="tagger"): def __init__(
self,
vocab,
model,
name="tagger",
*,
overwrite=BACKWARD_OVERWRITE,
scorer=tagger_score,
):
"""Initialize a part-of-speech tagger. """Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
DOCS: https://spacy.io/api/tagger#init DOCS: https://spacy.io/api/tagger#init
""" """
@ -74,8 +103,9 @@ class Tagger(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": []} cfg = {"labels": [], "overwrite": overwrite}
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -135,13 +165,13 @@ class Tagger(TrainablePipe):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags if doc.c[j].tag == 0 or overwrite:
if doc.c[j].tag == 0:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
def update(self, examples, *, drop=0., sgd=None, losses=None): def update(self, examples, *, drop=0., sgd=None, losses=None):
@ -289,15 +319,3 @@ class Tagger(TrainablePipe):
self.cfg["labels"].append(label) self.cfg["labels"].append(label)
self.vocab.strings.add(label) self.vocab.strings.add(label)
return 1 return 1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag".
DOCS: https://spacy.io/api/tagger#score
"""
validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)
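With the changes above, the tagger factory takes two new settings: overwrite (whether predicted tags may replace preexisting ones) and scorer (a registered scoring callback, defaulting to spacy.tagger_scorer.v1). A short sketch of overriding them at construction time, assuming spaCy v3.2+:

    import spacy

    nlp = spacy.blank("en")
    tagger = nlp.add_pipe(
        "tagger",
        config={"overwrite": True, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
    )
    assert tagger.cfg["overwrite"] is True  # stored in cfg and consulted in set_annotations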

View File

@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
@ -70,7 +71,11 @@ subword_features = true
@Language.factory( @Language.factory(
"textcat", "textcat",
assigns=["doc.cats"], assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL}, default_config={
"threshold": 0.5,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
},
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
"cats_score_desc": None, "cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
}, },
) )
def make_textcat( def make_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer": ) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
@ -95,8 +104,23 @@ def make_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
""" """
return TextCategorizer(nlp.vocab, model, name, threshold=threshold) return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
@registry.scorers("spacy.textcat_scorer.v1")
def make_textcat_scorer():
return textcat_score
class TextCategorizer(TrainablePipe): class TextCategorizer(TrainablePipe):
@ -106,7 +130,13 @@ class TextCategorizer(TrainablePipe):
""" """
def __init__( def __init__(
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float self,
vocab: Vocab,
model: Model,
name: str = "textcat",
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
) -> None: ) -> None:
"""Initialize a text categorizer for single-label classification. """Initialize a text categorizer for single-label classification.
@ -115,6 +145,8 @@ class TextCategorizer(TrainablePipe):
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats".
DOCS: https://spacy.io/api/textcategorizer#init DOCS: https://spacy.io/api/textcategorizer#init
""" """
@ -124,6 +156,7 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None} cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer
@property @property
def labels(self) -> Tuple[str]: def labels(self) -> Tuple[str]:
@ -353,26 +386,6 @@ class TextCategorizer(TrainablePipe):
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "TextCategorizer.score")
self._validate_categories(examples)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault("positive_label", self.cfg["positive_label"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=False,
**kwargs,
)
def _validate_categories(self, examples: Iterable[Example]): def _validate_categories(self, examples: Iterable[Example]):
"""Check whether the provided examples all have single-label cats annotations.""" """Check whether the provided examples all have single-label cats annotations."""
for ex in examples: for ex in examples:
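The textcat factory now resolves its scorer from the registry (spacy.textcat_scorer.v1 by default, wrapping Scorer.score_cats with multi_label=False) instead of a hard-coded score() method. A sketch of a pipeline that keeps the default scorer and only adjusts the threshold, assuming spaCy v3.2+ with labels chosen purely for illustration:

    import spacy

    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat", config={"threshold": 0.5})
    for label in ("POSITIVE", "NEGATIVE"):  # illustrative labels
        textcat.add_label(label)
    # after training, nlp.evaluate(...) reports cats_score via the registered scorer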

View File

@ -5,10 +5,11 @@ from thinc.api import Model, Config
from thinc.types import Floats2d from thinc.types import Floats2d
from ..language import Language from ..language import Language
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_get_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
from .textcat import TextCategorizer from .textcat import TextCategorizer
@ -70,7 +71,11 @@ subword_features = true
@Language.factory( @Language.factory(
"textcat_multilabel", "textcat_multilabel",
assigns=["doc.cats"], assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}, default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
},
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
"cats_score_desc": None, "cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
}, },
) )
def make_multilabel_textcat( def make_multilabel_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer": ) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
@ -97,7 +106,23 @@ def make_multilabel_textcat(
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
""" """
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold) return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=True,
**kwargs,
)
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score
class MultiLabel_TextCategorizer(TextCategorizer): class MultiLabel_TextCategorizer(TextCategorizer):
@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name: str = "textcat_multilabel", name: str = "textcat_multilabel",
*, *,
threshold: float, threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
) -> None: ) -> None:
"""Initialize a text categorizer for multi-label classification. """Initialize a text categorizer for multi-label classification.
@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold} cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer
def initialize( # type: ignore[override] def initialize( # type: ignore[override]
self, self,
@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "MultiLabel_TextCategorizer.score")
kwargs.setdefault("threshold", self.cfg["threshold"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=True,
**kwargs,
)
def _validate_categories(self, examples: Iterable[Example]): def _validate_categories(self, examples: Iterable[Example]):
"""This component allows any type of single- or multi-label annotations. """This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'.""" This method overwrites the more strict one from 'textcat'."""
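textcat_multilabel gets the same treatment via spacy.textcat_multilabel_scorer.v1, which calls Scorer.score_cats with multi_label=True so each label is scored independently rather than as one of a set of mutually exclusive classes. A self-contained sketch of calling the underlying scorer directly on a toy example (labels and scores are made up):

    import spacy
    from spacy.tokens import Doc
    from spacy.training import Example
    from spacy.scorer import Scorer

    nlp = spacy.blank("en")
    pred = Doc(nlp.vocab, words=["great", "game"])
    pred.cats = {"SPORTS": 0.9, "POLITICS": 0.2}
    ref = Doc(nlp.vocab, words=["great", "game"])
    ref.cats = {"SPORTS": 1.0, "POLITICS": 0.0}
    scores = Scorer.score_cats(
        [Example(pred, ref)], "cats", labels=["SPORTS", "POLITICS"], multi_label=True
    )
    print(scores["cats_macro_f"])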

View File

@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab cdef public Vocab vocab
cdef public object model cdef public object model
cdef public object cfg cdef public object cfg
cdef public object scorer

View File

@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
beam_density=0.0, beam_density=0.0,
beam_update_prob=0.0, beam_update_prob=0.0,
multitasks=tuple(), multitasks=tuple(),
incorrect_spans_key=None incorrect_spans_key=None,
scorer=None,
): ):
"""Create a Parser. """Create a Parser.
@ -86,6 +87,7 @@ cdef class Parser(TrainablePipe):
incorrect_spans_key (Optional[str]): Identifies spans that are known incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key. can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method. Defaults to None.
""" """
self.vocab = vocab self.vocab = vocab
self.name = name self.name = name
@ -117,6 +119,7 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask) self.add_multitask_objective(multitask)
self._rehearsal_model = None self._rehearsal_model = None
self.scorer = scorer
def __getnewargs_ex__(self): def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments""" """This allows pickling the Parser and its keyword-only init arguments"""

View File

@ -351,7 +351,8 @@ class ConfigSchemaPretrain(BaseModel):
# fmt: off # fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
dropout: StrictFloat = Field(..., title="Dropout rate") dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") n_save_every: Optional[StrictInt] = Field(..., title="Saving additional temporary model after n batches within an epoch")
n_save_epoch: Optional[StrictInt] = Field(..., title="Saving model after every n epoch")
optimizer: Optimizer = Field(..., title="The optimizer to use") optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: StrictStr = Field(..., title="Path in the config to the training data") corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data") batcher: Batcher = Field(..., title="Batcher for the training data")
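The pretraining schema gains n_save_epoch alongside n_save_every, so intermediate models can be written out both every n batches and after every n epochs. A sketch of setting both on an existing config, assuming the file already contains a [pretraining] block (for example one generated with spacy init config --pretraining); the paths are placeholders:

    from spacy.util import load_config

    config = load_config("config.cfg")           # placeholder path
    config["pretraining"]["n_save_every"] = 600  # save a temporary model every 600 batches
    config["pretraining"]["n_save_epoch"] = 2    # and additionally after every 2nd epoch
    config.to_disk("config_pretrain.cfg")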

View File

@ -247,18 +247,21 @@ class Scorer:
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg, **cfg,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Return PRF scores per feat for a token attribute in UFEATS format. """Return micro PRF and PRF scores per feat for a token attribute in
UFEATS format.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute to score. attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an getter(token, attr) should return the value of the attribute for an
individual token. individual token.
missing_values (Set[Any]): Attribute values to treat as missing annotation missing_values (Set[Any]): Attribute values to treat as missing
in the reference annotation. annotation in the reference annotation.
RETURNS (dict): A dictionary containing the per-feat PRF scores under RETURNS (dict): A dictionary containing the micro PRF scores under the
the key attr_per_feat. key attr_micro_p/r/f and the per-feat PRF scores under
attr_per_feat.
""" """
micro_score = PRFScore()
per_feat = {} per_feat = {}
for example in examples: for example in examples:
pred_doc = example.predicted pred_doc = example.predicted
@ -300,15 +303,22 @@ class Scorer:
pred_per_feat[field] = set() pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat)) pred_per_feat[field].add((gold_i, feat))
for field in per_feat: for field in per_feat:
micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
per_feat[field].score_set( per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
) )
score_key = f"{attr}_per_feat" result: Dict[str, Any] = {}
if any([len(v) for v in per_feat.values()]): if len(micro_score) > 0:
result = {k: v.to_dict() for k, v in per_feat.items()} result[f"{attr}_micro_p"] = micro_score.precision
return {score_key: result} result[f"{attr}_micro_r"] = micro_score.recall
result[f"{attr}_micro_f"] = micro_score.fscore
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
else: else:
return {score_key: None} result[f"{attr}_micro_p"] = None
result[f"{attr}_micro_r"] = None
result[f"{attr}_micro_f"] = None
result[f"{attr}_per_feat"] = None
return result
@staticmethod @staticmethod
def score_spans( def score_spans(
@ -545,7 +555,7 @@ class Scorer:
@staticmethod @staticmethod
def score_links( def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str] examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level. """Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER, To disentangle the performance of the NEL from the NER,
@ -721,7 +731,7 @@ class Scorer:
} }
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore) score_per_type = defaultdict(PRFScore)
for eg in examples: for eg in examples:

View File

@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t, hash_t from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0 cpdef hash_t hash_string(str string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string) cdef str decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str: ctypedef union Utf8Str:
@ -25,5 +25,5 @@ cdef class StringStore:
cdef vector[hash_t] keys cdef vector[hash_t] keys
cdef public PreshMap _map cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, unicode py_string) cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@ -33,7 +33,7 @@ def get_string_id(key):
return hash_utf8(chars, len(chars)) return hash_utf8(chars, len(chars))
cpdef hash_t hash_string(unicode string) except 0: cpdef hash_t hash_string(str string) except 0:
chars = string.encode("utf8") chars = string.encode("utf8")
return hash_utf8(chars, len(chars)) return hash_utf8(chars, len(chars))
@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1) return hash32(utf8_string, length, 1)
cdef unicode decode_Utf8Str(const Utf8Str* string): cdef str decode_Utf8Str(const Utf8Str* string):
cdef int i, length cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0: if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode("utf8") return string.s[1:string.s[0]+1].decode("utf8")
@ -107,17 +107,17 @@ cdef class StringStore:
def __getitem__(self, object string_or_id): def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa. """Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode. string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved. Returns (str / uint64): The value to be retrieved.
""" """
if isinstance(string_or_id, basestring) and len(string_or_id) == 0: if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0 return 0
elif string_or_id == 0: elif string_or_id == 0:
return "" return ""
elif string_or_id in SYMBOLS_BY_STR: elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id] return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key cdef hash_t key
if isinstance(string_or_id, unicode): if isinstance(string_or_id, str):
key = hash_string(string_or_id) key = hash_string(string_or_id)
return key return key
elif isinstance(string_or_id, bytes): elif isinstance(string_or_id, bytes):
@ -135,14 +135,14 @@ cdef class StringStore:
def as_int(self, key): def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value.""" """If key is an int, return it; otherwise, get the int value."""
if not isinstance(key, basestring): if not isinstance(key, str):
return key return key
else: else:
return self[key] return self[key]
def as_string(self, key): def as_string(self, key):
"""If key is a string, return it; otherwise, get the string value.""" """If key is a string, return it; otherwise, get the string value."""
if isinstance(key, basestring): if isinstance(key, str):
return key return key
else: else:
return self[key] return self[key]
@ -153,7 +153,7 @@ cdef class StringStore:
string (str): The string to add. string (str): The string to add.
RETURNS (uint64): The string's hash value. RETURNS (uint64): The string's hash value.
""" """
if isinstance(string, unicode): if isinstance(string, str):
if string in SYMBOLS_BY_STR: if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string] return SYMBOLS_BY_STR[string]
key = hash_string(string) key = hash_string(string)
@ -189,7 +189,7 @@ cdef class StringStore:
return True return True
elif string in SYMBOLS_BY_STR: elif string in SYMBOLS_BY_STR:
return True return True
elif isinstance(string, unicode): elif isinstance(string, str):
key = hash_string(string) key = hash_string(string)
else: else:
string = string.encode("utf8") string = string.encode("utf8")
@ -269,7 +269,7 @@ cdef class StringStore:
for string in strings: for string in strings:
self.add(string) self.add(string)
cdef const Utf8Str* intern_unicode(self, unicode py_string): cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8") cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string)) return self._intern_utf8(byte_string, len(byte_string))
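The strings.pxd/strings.pyx changes are a Python 2 cleanup (unicode/basestring become str); the public behaviour of the StringStore is unchanged. For reference, a minimal round trip:

    from spacy.strings import StringStore

    stringstore = StringStore(["apple"])
    orange_hash = stringstore.add("orange")  # returns the 64-bit hash
    assert stringstore[orange_hash] == "orange"
    assert stringstore["orange"] == orange_hash
    assert stringstore.as_string(orange_hash) == "orange"
    assert stringstore.as_int("orange") == orange_hash
    assert "orange" in stringstore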

View File

@ -5,9 +5,11 @@ from spacy.compat import pickle
def test_pickle_single_doc(): def test_pickle_single_doc():
nlp = Language() nlp = Language()
doc = nlp("pickle roundtrip") doc = nlp("pickle roundtrip")
doc._context = 3
data = pickle.dumps(doc, 1) data = pickle.dumps(doc, 1)
doc2 = pickle.loads(data) doc2 = pickle.loads(data)
assert doc2.text == "pickle roundtrip" assert doc2.text == "pickle roundtrip"
assert doc2._context == 3
def test_list_of_docs_pickles_efficiently(): def test_list_of_docs_pickles_efficiently():

View File

@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda." text = "La Dra. Puig viu a la pl. dels Til·lers."
tokens = ca_tokenizer(text) doc = ca_tokenizer(text)
assert len(tokens) == 15 assert [t.text for t in doc] == [
assert tokens[7].text == "aprox." "La",
"Dra.",
"Puig",
"viu",
"a",
"la",
"pl.",
"d",
"els",
"Til·lers",
".",
]

View File

@ -2,7 +2,14 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])] "text,expected_tokens",
[
("d'un", ["d'", "un"]),
("s'ha", ["s'", "ha"]),
("del", ["d", "el"]),
("cantar-te", ["cantar", "-te"]),
("-hola", ["-", "hola"]),
],
) )
def test_contractions(ca_tokenizer, text, expected_tokens): def test_contractions(ca_tokenizer, text, expected_tokens):
"""Test that the contractions are split into two tokens""" """Test that the contractions are split into two tokens"""

View File

@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
tokens = ca_tokenizer(text) tokens = ca_tokenizer(text)
assert len(tokens) == 140 assert len(tokens) == 146
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,length", "text,length",
[ [
("Perquè va anar-hi?", 4), ("Perquè va anar-hi?", 5),
("El cotxe dels veins.", 6),
("“Ah no?”", 5), ("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11), ("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5), ("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3), ("Llavors perqué...", 3),
("Vull parlar-te'n demà al matí", 8),
("Vull explicar-t'ho demà al matí", 8),
], ],
) )
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length): def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):

View File

@ -8,3 +8,17 @@ import pytest
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_ test_lemma = ja_tokenizer(word)[0].lemma_
assert test_lemma == lemma assert test_lemma == lemma
@pytest.mark.parametrize(
"word,norm",
[
("SUMMER", "サマー"),
("食べ物", "食べ物"),
("綜合", "総合"),
("コンピュータ", "コンピューター"),
],
)
def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
test_norm = ja_tokenizer(word)[0].norm_
assert test_norm == norm

View File

@ -0,0 +1,9 @@
import pytest
from spacy.lang.ja import Japanese
def test_ja_morphologizer_factory():
pytest.importorskip("sudachipy")
nlp = Japanese()
morphologizer = nlp.add_pipe("morphologizer")
assert morphologizer.cfg["extend"] is True

View File

@ -1,3 +1,5 @@
import pickle
from spacy.lang.ja import Japanese from spacy.lang.ja import Japanese
from ...util import make_tempdir from ...util import make_tempdir
@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
nlp_r.from_disk(d) nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes() assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.split_mode == "B" assert nlp_r.tokenizer.split_mode == "B"
def test_ja_tokenizer_pickle(ja_tokenizer):
b = pickle.dumps(ja_tokenizer)
ja_tokenizer_re = pickle.loads(b)
assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()

View File

@ -34,22 +34,22 @@ SENTENCE_TESTS = [
] ]
tokens1 = [ tokens1 = [
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None), DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
] ]
tokens2 = [ tokens2 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None), DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
] ]
tokens3 = [ tokens3 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None), DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
] ]
SUB_TOKEN_TESTS = [ SUB_TOKEN_TESTS = [
("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]]) ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
] ]
# fmt: on # fmt: on
@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
assert len(nlp_c(text)) == len_c assert len(nlp_c(text)) == len_c
@pytest.mark.parametrize( @pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
)
def test_ja_tokenizer_sub_tokens( def test_ja_tokenizer_sub_tokens(
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
): ):
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}}) nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}}) nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}}) nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a assert ja_tokenizer(text).user_data.get("sub_tokens") is None
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a assert nlp_a(text).user_data.get("sub_tokens") is None
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
@ -132,16 +130,24 @@ def test_ja_tokenizer_sub_tokens(
[ [
( (
"取ってつけた", "取ってつけた",
("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
("トッ", "", "ツケ", ""), (["トッ"], [""], ["ツケ"], [""]),
),
(
"2=3",
([], [], []),
([""], ["_"], ["サン"])
), ),
], ],
) )
def test_ja_tokenizer_inflections_reading_forms( def test_ja_tokenizer_inflections_reading_forms(
ja_tokenizer, text, inflections, reading_forms ja_tokenizer, text, inflections, reading_forms
): ):
assert ja_tokenizer(text).user_data["inflections"] == inflections tokens = ja_tokenizer(text)
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms test_inflections = [tt.morph.get("Inflection") for tt in tokens]
assert test_inflections == list(inflections)
test_readings = [tt.morph.get("Reading") for tt in tokens]
assert test_readings == list(reading_forms)
def test_ja_tokenizer_emptyish_texts(ja_tokenizer): def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
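As the updated tests show, the Japanese tokenizer now exposes readings and inflections through Token.morph (fields "Reading" and "Inflection") and the normalized form through token.norm_, instead of Doc.user_data. A sketch, assuming the optional SudachiPy dependencies for spacy.blank("ja") are installed:

    import spacy

    nlp = spacy.blank("ja")  # requires sudachipy + sudachidict_core
    doc = nlp("取ってつけた")
    for token in doc:
        print(token.text, token.norm_, token.morph.get("Reading"), token.morph.get("Inflection"))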

View File

@ -0,0 +1,24 @@
import pickle
from spacy.lang.ko import Korean
from ...util import make_tempdir
def test_ko_tokenizer_serialize(ko_tokenizer):
tokenizer_bytes = ko_tokenizer.to_bytes()
nlp = Korean()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
ko_tokenizer.to_disk(file_path)
nlp = Korean()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_ko_tokenizer_pickle(ko_tokenizer):
b = pickle.dumps(ko_tokenizer)
ko_tokenizer_re = pickle.loads(b)
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import pytest import pytest

View File

@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
("www.google.com", True), ("www.google.com", True),
("google.com", True), ("google.com", True),
("sydney.com", True), ("sydney.com", True),
("2girls1cup.org", True), ("1abc2def.org", True),
("http://stupid", True), ("http://stupid", True),
("www.hi", True), ("www.hi", True),
("example.com/example", True),
("dog", False), ("dog", False),
("1.2", False), ("1.2", False),
("1.a", False), ("1.a", False),

View File

@ -0,0 +1,24 @@
import pickle
from spacy.lang.th import Thai
from ...util import make_tempdir
def test_th_tokenizer_serialize(th_tokenizer):
tokenizer_bytes = th_tokenizer.to_bytes()
nlp = Thai()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
th_tokenizer.to_disk(file_path)
nlp = Thai()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_th_tokenizer_pickle(th_tokenizer):
b = pickle.dumps(th_tokenizer)
th_tokenizer_re = pickle.loads(b)
assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()

View File

@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length):
("10.000", True), ("10.000", True),
("1000", True), ("1000", True),
("999,0", True), ("999,0", True),
("", True), ("", True),
("ክልተ", True), ("ክልተ", True),
("ትሪልዮን", True), ("ትሪልዮን", True),
("ከልቢ", False), ("ከልቢ", False),

View File

@ -1,3 +1,5 @@
import pickle
from spacy.lang.vi import Vietnamese from spacy.lang.vi import Vietnamese
from ...util import make_tempdir from ...util import make_tempdir
@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
nlp_r.from_disk(d) nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes() assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi is False assert nlp_r.tokenizer.use_pyvi is False
def test_vi_tokenizer_pickle(vi_tokenizer):
b = pickle.dumps(vi_tokenizer)
vi_tokenizer_re = pickle.loads(b)
assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()

View File

@ -32,24 +32,6 @@ def pattern_dicts():
] ]
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
@pytest.fixture @pytest.fixture
def tag_map(): def tag_map():
return { return {
@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
# initialize with patterns from misc registry # initialize with patterns from misc registry
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
nlp.config["initialize"]["components"]["attribute_ruler"] = { nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"} "patterns": {"@misc": "attribute_ruler_patterns"}
} }
@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
assert scores["lemma_acc"] == pytest.approx(0.2) assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set # no morphs are set
assert scores["morph_acc"] is None assert scores["morph_acc"] is None
nlp.remove_pipe("attribute_ruler")
# test with custom scorer
@registry.misc("weird_scorer.v1")
def make_weird_scorer():
def weird_scorer(examples, weird_score, **kwargs):
return {"weird_score": weird_score}
return weird_scorer
ruler = nlp.add_pipe(
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
)
ruler.initialize(lambda: [], patterns=pattern_dicts)
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
assert scores["weird_score"] == 0.12345
assert "token_acc" in scores
assert "lemma_acc" not in scores
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
assert scores["weird_score"] == 0.23456
def test_attributeruler_rule_order(nlp): def test_attributeruler_rule_order(nlp):

View File

@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
from spacy.morphology import Morphology from spacy.morphology import Morphology
from spacy.attrs import MORPH from spacy.attrs import MORPH
from spacy.tokens import Doc
def test_label_types(): def test_label_types():
@ -137,6 +138,41 @@ def test_overfitting_IO():
assert [str(t.morph) for t in doc] == gold_morphs assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags
# Test overwrite+extend settings
# (note that "" is unset, "_" is set and empty)
morphs = ["Feat=V", "Feat=N", "_"]
doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs)
orig_morphs = [str(t.morph) for t in doc]
orig_pos_tags = [t.pos_ for t in doc]
morphologizer = nlp.get_pipe("morphologizer")
# don't overwrite or extend
morphologizer.cfg["overwrite"] = False
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == orig_morphs
assert [t.pos_ for t in doc] == orig_pos_tags
# overwrite and extend
morphologizer.cfg["overwrite"] = True
morphologizer.cfg["extend"] = True
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"]
# extend without overwriting
morphologizer.cfg["overwrite"] = False
morphologizer.cfg["extend"] = True
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"]
# overwrite without extending
morphologizer.cfg["overwrite"] = True
morphologizer.cfg["extend"] = False
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"]
# Test with unset morph and partial POS # Test with unset morph and partial POS
nlp.remove_pipe("morphologizer") nlp.remove_pipe("morphologizer")
nlp.add_pipe("morphologizer") nlp.add_pipe("morphologizer")

View File

@ -1,7 +1,9 @@
import pytest import pytest
import pickle import pickle
from thinc.api import get_current_ops
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.strings import StringStore from spacy.strings import StringStore
from spacy.vectors import Vectors
from ..util import make_tempdir from ..util import make_tempdir
@ -129,7 +131,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_pickle_vocab(strings, lex_attr): def test_pickle_vocab(strings, lex_attr):
vocab = Vocab(strings=strings) vocab = Vocab(strings=strings)
ops = get_current_ops()
vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
vocab.vectors = vectors
vocab[strings[0]].norm_ = lex_attr vocab[strings[0]].norm_ = lex_attr
vocab_pickled = pickle.dumps(vocab) vocab_pickled = pickle.dumps(vocab)
vocab_unpickled = pickle.loads(vocab_pickled) vocab_unpickled = pickle.loads(vocab_pickled)
assert vocab.to_bytes() == vocab_unpickled.to_bytes() assert vocab.to_bytes() == vocab_unpickled.to_bytes()
assert vocab_unpickled.vectors.mode == "floret"

View File

@ -1,5 +1,6 @@
import pytest import pytest
from click import NoSuchOption from click import NoSuchOption
from packaging.specifiers import SpecifierSet
from spacy.training import docs_to_json, offsets_to_biluo_tags from spacy.training import docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
@ -491,19 +492,27 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3] assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_download_compatibility(): def test_download_compatibility():
model_name = "en_core_web_sm" spec = SpecifierSet("==" + about.__version__)
compatibility = get_compatibility() spec.prereleases = False
version = get_version(model_name, compatibility) if about.__version__ in spec:
assert get_minor_version(about.__version__) == get_minor_version(version) model_name = "en_core_web_sm"
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_validate_compatibility_table(): def test_validate_compatibility_table():
model_pkgs, compat = get_model_pkgs() spec = SpecifierSet("==" + about.__version__)
spacy_version = get_minor_version(about.__version__) spec.prereleases = False
current_compat = compat.get(spacy_version, {}) if about.__version__ in spec:
assert len(current_compat) > 0 model_pkgs, compat = get_model_pkgs()
assert "en_core_web_sm" in current_compat spacy_version = get_minor_version(about.__version__)
current_compat = compat.get(spacy_version, {})
assert len(current_compat) > 0
assert "en_core_web_sm" in current_compat
@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"]) @pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"])

View File

@ -8,7 +8,7 @@ from spacy.vocab import Vocab
from spacy.training import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.de import German from spacy.lang.de import German
from spacy.util import registry, ignore_error, raise_error from spacy.util import registry, ignore_error, raise_error, find_matching_language
import spacy import spacy
from thinc.api import CupyOps, NumpyOps, get_current_ops from thinc.api import CupyOps, NumpyOps, get_current_ops
@ -255,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
"""Test the error handling of nlp.pipe with input as tuples"""
Language.component("my_evil_component", func=evil_component)
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.add_pipe("my_evil_component")
texts = [
("TEXT 111", 111),
("TEXT 222", 222),
("TEXT 333", 333),
("TEXT 342", 342),
("TEXT 666", 666),
]
with pytest.raises(ValueError):
list(nlp.pipe(texts, as_tuples=True))
nlp.set_error_handler(warn_error)
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
# HACK/TODO? the warnings in child processes don't seem to be
# detected by the mock logger
if n_process == 1:
mock_warning.assert_called()
assert mock_warning.call_count == 2
assert len(tuples) + mock_warning.call_count == len(texts)
assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
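nlp.pipe(..., as_tuples=True) streams (text, context) pairs, and the new test checks that documents rejected by the error handler are skipped or warned about without desynchronizing the contexts. Basic usage, independent of the error-handler details:

    import spacy

    nlp = spacy.blank("en")
    data = [("A short text", {"id": 1}), ("Another text", {"id": 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context["id"], doc.text)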
@pytest.mark.parametrize("n_process", [1, 2]) @pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_pipe(en_vocab, n_process): def test_language_pipe_error_handler_pipe(en_vocab, n_process):
"""Test the error handling of a component's pipe method""" """Test the error handling of a component's pipe method"""
@ -512,6 +544,55 @@ def test_spacy_blank():
assert nlp.meta["name"] == "my_custom_model" assert nlp.meta["name"] == "my_custom_model"
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
("zh-Hant", None),
("zxx", None),
],
)
def test_language_matching(lang, target):
"""
Test that we can look up languages by equivalent or nearly-equivalent
language codes.
"""
assert find_matching_language(lang) == target
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
],
)
def test_blank_languages(lang, target):
"""
Test that we can get spacy.blank in various languages, including codes
that are defined to be equivalent or that match by CLDR language matching.
"""
nlp = spacy.blank(lang)
assert nlp.lang == target
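The two new test groups cover the language-code matching added in spacy.util.find_matching_language and used by spacy.blank, so equivalent or legacy codes resolve to the canonical spaCy language. For example:

    import spacy
    from spacy.util import find_matching_language

    assert find_matching_language("fra") == "fr"  # ISO 639-2 alias
    assert find_matching_language("iw") == "he"   # legacy code
    nlp = spacy.blank("mul")                      # resolves to spaCy's multi-language "xx"
    assert nlp.lang == "xx"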
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab]) @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
def test_language_init_invalid_vocab(value): def test_language_init_invalid_vocab(value):
err_fragment = "invalid value" err_fragment = "invalid value"
@ -540,6 +621,32 @@ def test_language_source_and_vectors(nlp2):
assert nlp.vocab.vectors.to_bytes() == vectors_bytes assert nlp.vocab.vectors.to_bytes() == vectors_bytes
@pytest.mark.parametrize("n_process", [1, 2])
def test_pass_doc_to_pipeline(nlp, n_process):
texts = ["cats", "dogs", "guinea pigs"]
docs = [nlp.make_doc(text) for text in texts]
assert not any(len(doc.cats) for doc in docs)
doc = nlp(docs[0])
assert doc.text == texts[0]
assert len(doc.cats) > 0
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
docs = nlp.pipe(docs, n_process=n_process)
assert [doc.text for doc in docs] == texts
assert all(len(doc.cats) for doc in docs)
def test_invalid_arg_to_pipeline(nlp):
str_list = ["This is a text.", "This is another."]
with pytest.raises(ValueError):
nlp(str_list) # type: ignore
assert len(list(nlp.pipe(str_list))) == 2
int_list = [1, 2, 3]
with pytest.raises(ValueError):
list(nlp.pipe(int_list)) # type: ignore
with pytest.raises(ValueError):
nlp(int_list) # type: ignore
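test_pass_doc_to_pipeline and test_invalid_arg_to_pipeline pin down the new input handling: nlp() and nlp.pipe() accept pre-tokenized Doc objects and raise a ValueError for anything that is neither text nor Doc. A short sketch:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp.make_doc("guinea pigs")  # tokenize only
    doc = nlp(doc)                     # run the pipeline components on an existing Doc
    docs = list(nlp.pipe([nlp.make_doc("cats"), nlp.make_doc("dogs")]))
    assert [d.text for d in docs] == ["cats", "dogs"]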
@pytest.mark.skipif( @pytest.mark.skipif(
not isinstance(get_current_ops(), CupyOps), reason="test requires GPU" not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
) )

Some files were not shown because too many files have changed in this diff.